From 354bb40e75d94466e91fe6960523612c9d17ccfb Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Thu, 2 Nov 2017 23:11:29 +0300
Subject: Add implementation

---
 mysql/mysys/mf_keycache.c | 4051 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 4051 insertions(+)
 create mode 100644 mysql/mysys/mf_keycache.c

(limited to 'mysql/mysys/mf_keycache.c')

diff --git a/mysql/mysys/mf_keycache.c b/mysql/mysys/mf_keycache.c
new file mode 100644
index 0000000..cb712ed
--- /dev/null
+++ b/mysql/mysys/mf_keycache.c
@@ -0,0 +1,4051 @@
+/* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
+
+/**
+  @file
+  These functions handle keyblock cacheing for ISAM and MyISAM tables.
+
+  One cache can handle many files.
+  It must contain buffers of the same blocksize.
+  init_key_cache() should be used to init cache handler.
+
+  The free list (free_block_list) is a stack like structure.
+  When a block is freed by free_block(), it is pushed onto the stack.
+  When a new block is required it is first tried to pop one from the stack.
+  If the stack is empty, it is tried to get a never-used block from the pool.
+  If this is empty too, then a block is taken from the LRU ring, flushing it
+  to disk, if neccessary. This is handled in find_key_block().
+  With the new free list, the blocks can have three temperatures:
+  hot, warm and cold (which is free). This is remembered in the block header
+  by the enum BLOCK_TEMPERATURE temperature variable. Remembering the
+  temperature is neccessary to correctly count the number of warm blocks,
+  which is required to decide when blocks are allowed to become hot. Whenever
+  a block is inserted to another (sub-)chain, we take the old and new
+  temperature into account to decide if we got one more or less warm block.
+  blocks_unused is the sum of never used blocks in the pool and of currently
+  free blocks. blocks_used is the number of blocks fetched from the pool and
+  as such gives the maximum number of in-use blocks at any time.
+*/
+
+/*
+  Key Cache Locking
+  =================
+
+  All key cache locking is done with a single mutex per key cache:
+  keycache->cache_lock. This mutex is locked almost all the time
+  when executing code in this file (mf_keycache.c).
+  However it is released for I/O and some copy operations.
+
+  The cache_lock is also released when waiting for some event. Waiting
+  and signalling is done via condition variables. In most cases the
+  thread waits on its thread->suspend condition variable. Every thread
+  has a my_thread_var structure, which contains this variable and a
+  '*next' and '**prev' pointer. These pointers are used to insert the
+  thread into a wait queue.
+
+  A thread can wait for one block and thus be in one wait queue at a
+  time only.
+
+  Before starting to wait on its condition variable with
+  mysql_cond_wait(), the thread enters itself to a specific wait queue
+  with link_into_queue() (double linked with '*next' + '**prev') or
+  wait_on_queue() (single linked with '*next').
+
+  Another thread, when releasing a resource, looks up the waiting thread
+  in the related wait queue. It sends a signal with
+  mysql_cond_signal() to the waiting thread.
+
+  NOTE: Depending on the particular wait situation, either the sending
+  thread removes the waiting thread from the wait queue with
+  unlink_from_queue() or release_whole_queue() respectively, or the waiting
+  thread removes itself.
+
+  There is one exception from this locking scheme when one thread wants
+  to reuse a block for some other address. This works by first marking
+  the block reserved (status= BLOCK_IN_SWITCH) and then waiting for all
+  threads that are reading the block to finish. Each block has a
+  reference to a condition variable (condvar). It holds a reference to
+  the thread->suspend condition variable for the waiting thread (if such
+  a thread exists). When that thread is signaled, the reference is
+  cleared. The number of readers of a block is registered in
+  block->hash_link->requests. See wait_for_readers() / remove_reader()
+  for details. This is similar to the above, but it clearly means that
+  only one thread can wait for a particular block. There is no queue in
+  this case. Strangely enough block->convar is used for waiting for the
+  assigned hash_link only. More precisely it is used to wait for all
+  requests to be unregistered from the assigned hash_link.
+
+  The resize_queue serves two purposes:
+  1. Threads that want to do a resize wait there if in_resize is set.
+     This is not used in the server. The server refuses a second resize
+     request if one is already active. keycache->in_init is used for the
+     synchronization. See set_var.cc.
+  2. Threads that want to access blocks during resize wait here during
+     the re-initialization phase.
+  When the resize is done, all threads on the queue are signalled.
+  Hypothetical resizers can compete for resizing, and read/write
+  requests will restart to request blocks from the freshly resized
+  cache. If the cache has been resized too small, it is disabled and
+  'can_be_used' is false. In this case read/write requests bypass the
+  cache. Since they increment and decrement 'cnt_for_resize_op', the
+  next resizer can wait on the queue 'waiting_for_resize_cnt' until all
+  I/O finished.
+*/
+
+#include "mysys_priv.h"
+#include "mysys_err.h"
+#include <keycache.h>
+#include "my_static.h"
+#include <m_string.h>
+#include <my_bit.h>
+#include <errno.h>
+#include <stdarg.h>
+#include "probes_mysql.h"
+#include "my_thread_local.h"
+
+#define STRUCT_PTR(TYPE, MEMBER, a)                                           \
+          (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
+
+/* types of condition variables */
+#define  COND_FOR_REQUESTED 0
+#define  COND_FOR_SAVED     1
+#define  COND_FOR_READERS   2
+
+typedef mysql_cond_t KEYCACHE_CONDVAR;
+
+/* descriptor of the page in the key cache block buffer */
+struct st_keycache_page
+{
+  int file;               /* file to which the page belongs to  */
+  my_off_t filepos;       /* position of the page in the file   */
+};
+typedef struct st_keycache_page KEYCACHE_PAGE;
+
+/* element in the chain of a hash table bucket */
+struct st_hash_link
+{
+  struct st_hash_link *next, **prev; /* to connect links in the same bucket  */
+  struct st_block_link *block;       /* reference to the block for the page: */
+  File file;                         /* from such a file                     */
+  my_off_t diskpos;                  /* with such an offset                  */
+  uint requests;                     /* number of requests for the page      */
+};
+
+/* simple states of a block */
+#define BLOCK_ERROR           1 /* an error occured when performing file i/o */
+#define BLOCK_READ            2 /* file block is in the block buffer         */
+#define BLOCK_IN_SWITCH       4 /* block is preparing to read new page       */
+#define BLOCK_REASSIGNED      8 /* blk does not accept requests for old page */
+#define BLOCK_IN_FLUSH       16 /* block is selected for flush               */
+#define BLOCK_CHANGED        32 /* block buffer contains a dirty page        */
+#define BLOCK_IN_USE         64 /* block is not free                         */
+#define BLOCK_IN_EVICTION   128 /* block is selected for eviction            */
+#define BLOCK_IN_FLUSHWRITE 256 /* block is in write to file                 */
+#define BLOCK_FOR_UPDATE    512 /* block is selected for buffer modification */
+
+/* page status, returned by find_key_block */
+#define PAGE_READ               0
+#define PAGE_TO_BE_READ         1
+#define PAGE_WAIT_TO_BE_READ    2
+
+/* block temperature determines in which (sub-)chain the block currently is */
+enum BLOCK_TEMPERATURE { BLOCK_COLD /*free*/ , BLOCK_WARM , BLOCK_HOT };
+
+/* key cache block */
+struct st_block_link
+{
+  struct st_block_link
+    *next_used, **prev_used;   /* to connect links in the LRU chain (ring)   */
+  struct st_block_link
+    *next_changed, **prev_changed; /* for lists of file dirty/clean blocks   */
+  struct st_hash_link *hash_link; /* backward ptr to referring hash_link     */
+  KEYCACHE_WQUEUE wqueue[2]; /* queues on waiting requests for new/old pages */
+  uint requests;          /* number of requests for the block                */
+  uchar *buffer;           /* buffer for the block page                       */
+  uint offset;            /* beginning of modified data in the buffer        */
+  uint length;            /* end of data in the buffer                       */
+  uint status;            /* state of the block                              */
+  enum BLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot */
+  uint hits_left;         /* number of hits left until promotion             */
+  ulonglong last_hit_time; /* timestamp of the last hit                      */
+  KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event    */
+};
+
+KEY_CACHE dflt_key_cache_var;
+KEY_CACHE *dflt_key_cache= &dflt_key_cache_var;
+
+#define FLUSH_CACHE         2000            /* sort this many blocks at once */
+
+static void change_key_cache_param(KEY_CACHE *keycache,
+                                   ulonglong division_limit,
+                                   ulonglong age_threshold);
+static int flush_all_key_blocks(KEY_CACHE *keycache,
+                                st_keycache_thread_var *thread_var);
+
+static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
+                          mysql_mutex_t *mutex,
+                          st_keycache_thread_var *thread);
+static void release_whole_queue(KEYCACHE_WQUEUE *wqueue);
+
+static void free_block(KEY_CACHE *keycache,
+                       st_keycache_thread_var *thread_var,
+                       BLOCK_LINK *block);
+
+#define KEYCACHE_HASH(f, pos)                                                 \
+(((ulong) ((pos) / keycache->key_cache_block_size) +                          \
+                                     (ulong) (f)) & (keycache->hash_entries-1))
+#define FILE_HASH(f)                 ((uint) (f) & (CHANGED_BLOCKS_HASH-1))
+
+#define BLOCK_NUMBER(b)                                                       \
+  ((uint) (((char*)(b)-(char *) keycache->block_root)/sizeof(BLOCK_LINK)))
+#define HASH_LINK_NUMBER(h)                                                   \
+  ((uint) (((char*)(h)-(char *) keycache->hash_link_root)/sizeof(HASH_LINK)))
+
+#if !defined(DBUG_OFF)
+static int fail_block(BLOCK_LINK *block);
+static int fail_hlink(HASH_LINK *hlink);
+static int cache_empty(KEY_CACHE *keycache);
+#endif
+
+static inline uint next_power(uint value)
+{
+  return (uint) my_round_up_to_next_power((uint32) value) << 1;
+}
+
+
+/*
+  Initialize a key cache
+
+  SYNOPSIS
+    init_key_cache()
+    keycache			pointer to a key cache data structure
+    key_cache_block_size	size of blocks to keep cached data
+    use_mem                 	total memory to use for the key cache
+    division_limit		division limit (may be zero)
+    age_threshold		age threshold (may be zero)
+
+  RETURN VALUE
+    number of blocks in the key cache, if successful,
+    0 - otherwise.
+
+  NOTES.
+    if keycache->key_cache_inited != 0 we assume that the key cache
+    is already initialized.  This is for now used by myisamchk, but shouldn't
+    be something that a program should rely on!
+
+    It's assumed that no two threads call this function simultaneously
+    referring to the same key cache handle.
+
+*/
+
+int init_key_cache(KEY_CACHE *keycache, ulonglong key_cache_block_size,
+                   size_t use_mem, ulonglong division_limit,
+                   ulonglong age_threshold)
+{
+  ulong blocks, hash_links;
+  size_t length;
+  int error;
+  DBUG_ENTER("init_key_cache");
+  DBUG_ASSERT(key_cache_block_size >= 512);
+
+  if (keycache->key_cache_inited && keycache->disk_blocks > 0)
+  {
+    DBUG_PRINT("warning",("key cache already in use"));
+    DBUG_RETURN(0);
+  }
+
+  keycache->global_cache_w_requests= keycache->global_cache_r_requests= 0;
+  keycache->global_cache_read= keycache->global_cache_write= 0;
+  keycache->disk_blocks= -1;
+  if (! keycache->key_cache_inited)
+  {
+    keycache->key_cache_inited= 1;
+    /*
+      Initialize these variables once only.
+      Their value must survive re-initialization during resizing.
+    */
+    keycache->in_resize= 0;
+    keycache->resize_in_flush= 0;
+    keycache->cnt_for_resize_op= 0;
+    keycache->waiting_for_resize_cnt.last_thread= NULL;
+    keycache->in_init= 0;
+    mysql_mutex_init(key_KEY_CACHE_cache_lock,
+                     &keycache->cache_lock, MY_MUTEX_INIT_FAST);
+    keycache->resize_queue.last_thread= NULL;
+  }
+
+  keycache->key_cache_mem_size= use_mem;
+  keycache->key_cache_block_size= (uint)key_cache_block_size;
+  DBUG_PRINT("info", ("key_cache_block_size: %llu",
+		      key_cache_block_size));
+
+  blocks= (ulong) (use_mem / (sizeof(BLOCK_LINK) + 2 * sizeof(HASH_LINK) +
+                              sizeof(HASH_LINK*) * 5/4 + key_cache_block_size));
+  /* It doesn't make sense to have too few blocks (less than 8) */
+  if (blocks >= 8)
+  {
+    for ( ; ; )
+    {
+      /* Set my_hash_entries to the next bigger 2 power */
+      if ((keycache->hash_entries= next_power(blocks)) < blocks * 5/4)
+        keycache->hash_entries<<= 1;
+      hash_links= 2 * blocks;
+      while ((length= (ALIGN_SIZE(blocks * sizeof(BLOCK_LINK)) +
+		       ALIGN_SIZE(hash_links * sizeof(HASH_LINK)) +
+		       ALIGN_SIZE(sizeof(HASH_LINK*) *
+                                  keycache->hash_entries))) +
+	     ((size_t) blocks * keycache->key_cache_block_size) > use_mem)
+        blocks--;
+      /* Allocate memory for cache page buffers */
+      if ((keycache->block_mem=
+	   my_large_malloc(key_memory_KEY_CACHE,
+                           (size_t) blocks * keycache->key_cache_block_size,
+			  MYF(0))))
+      {
+        /*
+	  Allocate memory for blocks, hash_links and hash entries;
+	  For each block 2 hash links are allocated
+        */
+        if ((keycache->block_root= (BLOCK_LINK*) my_malloc(key_memory_KEY_CACHE,
+                                                           length,
+                                                           MYF(0))))
+          break;
+        my_large_free(keycache->block_mem);
+        keycache->block_mem= 0;
+      }
+      if (blocks < 8)
+      {
+        set_my_errno(ENOMEM);
+        my_error(EE_OUTOFMEMORY, MYF(ME_FATALERROR),
+                 blocks * keycache->key_cache_block_size);
+        goto err;
+      }
+      blocks= blocks / 4*3;
+    }
+    keycache->blocks_unused= blocks;
+    keycache->disk_blocks= (int) blocks;
+    keycache->hash_links= hash_links;
+    keycache->hash_root= (HASH_LINK**) ((char*) keycache->block_root +
+				        ALIGN_SIZE(blocks*sizeof(BLOCK_LINK)));
+    keycache->hash_link_root= (HASH_LINK*) ((char*) keycache->hash_root +
+				            ALIGN_SIZE((sizeof(HASH_LINK*) *
+							keycache->hash_entries)));
+    memset(keycache->block_root, 0,
+	  keycache->disk_blocks * sizeof(BLOCK_LINK));
+    memset(keycache->hash_root, 0,
+          keycache->hash_entries * sizeof(HASH_LINK*));
+    memset(keycache->hash_link_root, 0,
+	  keycache->hash_links * sizeof(HASH_LINK));
+    keycache->hash_links_used= 0;
+    keycache->free_hash_list= NULL;
+    keycache->blocks_used= keycache->blocks_changed= 0;
+
+    keycache->global_blocks_changed= 0;
+    keycache->blocks_available=0;		/* For debugging */
+
+    /* The LRU chain is empty after initialization */
+    keycache->used_last= NULL;
+    keycache->used_ins= NULL;
+    keycache->free_block_list= NULL;
+    keycache->keycache_time= 0;
+    keycache->warm_blocks= 0;
+    keycache->min_warm_blocks= (division_limit ?
+				blocks * division_limit / 100 + 1 :
+				blocks);
+    keycache->age_threshold= (age_threshold ?
+			      blocks * age_threshold / 100 :
+			      blocks);
+
+    keycache->can_be_used= 1;
+
+    keycache->waiting_for_hash_link.last_thread= NULL;
+    keycache->waiting_for_block.last_thread= NULL;
+    DBUG_PRINT("exit",
+	       ("disk_blocks: %d  block_root: 0x%lx  hash_entries: %d\
+ hash_root: 0x%lx  hash_links: %d  hash_link_root: 0x%lx",
+		keycache->disk_blocks,  (long) keycache->block_root,
+		keycache->hash_entries, (long) keycache->hash_root,
+		keycache->hash_links,   (long) keycache->hash_link_root));
+    memset(keycache->changed_blocks, 0,
+	  sizeof(keycache->changed_blocks[0]) * CHANGED_BLOCKS_HASH);
+    memset(keycache->file_blocks, 0,
+	  sizeof(keycache->file_blocks[0]) * CHANGED_BLOCKS_HASH);
+  }
+  else
+  {
+    /* key_buffer_size is specified too small. Disable the cache. */
+    keycache->can_be_used= 0;
+  }
+
+  keycache->blocks= keycache->disk_blocks > 0 ? keycache->disk_blocks : 0;
+  DBUG_RETURN((int) keycache->disk_blocks);
+
+err:
+  error= my_errno();
+  keycache->disk_blocks= 0;
+  keycache->blocks=  0;
+  if (keycache->block_mem)
+  {
+    my_large_free((uchar*) keycache->block_mem);
+    keycache->block_mem= NULL;
+  }
+  if (keycache->block_root)
+  {
+    my_free(keycache->block_root);
+    keycache->block_root= NULL;
+  }
+  set_my_errno(error);
+  keycache->can_be_used= 0;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Resize a key cache
+
+  SYNOPSIS
+    resize_key_cache()
+    keycache     	        pointer to a key cache data structure
+    thread_var                  pointer to thread specific variables
+    key_cache_block_size        size of blocks to keep cached data
+    use_mem			total memory to use for the new key cache
+    division_limit		new division limit (if not zero)
+    age_threshold		new age threshold (if not zero)
+
+  RETURN VALUE
+    number of blocks in the key cache, if successful,
+    0 - otherwise.
+
+  NOTES.
+    The function first compares the memory size and the block size parameters
+    with the key cache values.
+
+    If they differ the function free the the memory allocated for the
+    old key cache blocks by calling the end_key_cache function and
+    then rebuilds the key cache with new blocks by calling
+    init_key_cache.
+
+    The function starts the operation only when all other threads
+    performing operations with the key cache let her to proceed
+    (when cnt_for_resize=0).
+*/
+
+int resize_key_cache(KEY_CACHE *keycache,
+                     st_keycache_thread_var *thread_var,
+                     ulonglong key_cache_block_size,
+                     size_t use_mem, ulonglong division_limit,
+                     ulonglong age_threshold)
+{
+  int blocks;
+  DBUG_ENTER("resize_key_cache");
+
+  if (!keycache->key_cache_inited)
+    DBUG_RETURN(keycache->disk_blocks);
+
+  if(key_cache_block_size == keycache->key_cache_block_size &&
+     use_mem == keycache->key_cache_mem_size)
+  {
+    change_key_cache_param(keycache, division_limit, age_threshold);
+    DBUG_RETURN(keycache->disk_blocks);
+  }
+
+  mysql_mutex_lock(&keycache->cache_lock);
+
+  /*
+    We may need to wait for another thread which is doing a resize
+    already. This cannot happen in the MySQL server though. It allows
+    one resizer only. In set_var.cc keycache->in_init is used to block
+    multiple attempts.
+  */
+  while (keycache->in_resize)
+  {
+    /* purecov: begin inspected */
+    wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
+                  thread_var);
+    /* purecov: end */
+  }
+
+  /*
+    Mark the operation in progress. This blocks other threads from doing
+    a resize in parallel. It prohibits new blocks to enter the cache.
+    Read/write requests can bypass the cache during the flush phase.
+  */
+  keycache->in_resize= 1;
+
+  /* Need to flush only if keycache is enabled. */
+  if (keycache->can_be_used)
+  {
+    /* Start the flush phase. */
+    keycache->resize_in_flush= 1;
+
+    if (flush_all_key_blocks(keycache, thread_var))
+    {
+      /* TODO: if this happens, we should write a warning in the log file ! */
+      keycache->resize_in_flush= 0;
+      blocks= 0;
+      keycache->can_be_used= 0;
+      goto finish;
+    }
+    DBUG_ASSERT(cache_empty(keycache));
+
+    /* End the flush phase. */
+    keycache->resize_in_flush= 0;
+  }
+
+  /*
+    Some direct read/write operations (bypassing the cache) may still be
+    unfinished. Wait until they are done. If the key cache can be used,
+    direct I/O is done in increments of key_cache_block_size. That is,
+    every block is checked if it is in the cache. We need to wait for
+    pending I/O before re-initializing the cache, because we may change
+    the block size. Otherwise they could check for blocks at file
+    positions where the new block division has none. We do also want to
+    wait for I/O done when (if) the cache was disabled. It must not
+    run in parallel with normal cache operation.
+  */
+  while (keycache->cnt_for_resize_op)
+    wait_on_queue(&keycache->waiting_for_resize_cnt, &keycache->cache_lock,
+                  thread_var);
+
+  /*
+    Free old cache structures, allocate new structures, and initialize
+    them. Note that the cache_lock mutex and the resize_queue are left
+    untouched. We do not lose the cache_lock and will release it only at
+    the end of this function.
+  */
+  end_key_cache(keycache, 0);			/* Don't free mutex */
+  /* The following will work even if use_mem is 0 */
+  blocks= init_key_cache(keycache, key_cache_block_size, use_mem,
+			 division_limit, age_threshold);
+
+finish:
+  /*
+    Mark the resize finished. This allows other threads to start a
+    resize or to request new cache blocks.
+  */
+  keycache->in_resize= 0;
+
+  /* Signal waiting threads. */
+  release_whole_queue(&keycache->resize_queue);
+
+  mysql_mutex_unlock(&keycache->cache_lock);
+  DBUG_RETURN(blocks);
+}
+
+
+/*
+  Increment counter blocking resize key cache operation
+*/
+static inline void inc_counter_for_resize_op(KEY_CACHE *keycache)
+{
+  keycache->cnt_for_resize_op++;
+}
+
+
+/*
+  Decrement counter blocking resize key cache operation;
+  Signal the operation to proceed when counter becomes equal zero
+*/
+static inline void dec_counter_for_resize_op(KEY_CACHE *keycache)
+{
+  if (!--keycache->cnt_for_resize_op)
+    release_whole_queue(&keycache->waiting_for_resize_cnt);
+}
+
+/*
+  Change the key cache parameters
+
+  SYNOPSIS
+    change_key_cache_param()
+    keycache			pointer to a key cache data structure
+    division_limit		new division limit (if not zero)
+    age_threshold		new age threshold (if not zero)
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Presently the function resets the key cache parameters
+    concerning midpoint insertion strategy - division_limit and
+    age_threshold.
+*/
+
+static void change_key_cache_param(KEY_CACHE *keycache,
+                                   ulonglong division_limit,
+                                   ulonglong age_threshold)
+{
+  DBUG_ENTER("change_key_cache_param");
+
+  mysql_mutex_lock(&keycache->cache_lock);
+  if (division_limit)
+    keycache->min_warm_blocks= (keycache->disk_blocks *
+				division_limit / 100 + 1);
+  if (age_threshold)
+    keycache->age_threshold=   (keycache->disk_blocks *
+				age_threshold / 100);
+  mysql_mutex_unlock(&keycache->cache_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Remove key_cache from memory
+
+  SYNOPSIS
+    end_key_cache()
+    keycache		key cache handle
+    cleanup		Complete free (Free also mutex for key cache)
+
+  RETURN VALUE
+    none
+*/
+
+void end_key_cache(KEY_CACHE *keycache, my_bool cleanup)
+{
+  DBUG_ENTER("end_key_cache");
+  DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) keycache));
+
+  if (!keycache->key_cache_inited)
+    DBUG_VOID_RETURN;
+
+  if (keycache->disk_blocks > 0)
+  {
+    if (keycache->block_mem)
+    {
+      my_large_free((uchar*) keycache->block_mem);
+      keycache->block_mem= NULL;
+      my_free(keycache->block_root);
+      keycache->block_root= NULL;
+    }
+    keycache->disk_blocks= -1;
+    /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
+    keycache->blocks_changed= 0;
+  }
+
+  DBUG_PRINT("status", ("used: %lu  changed: %lu  w_requests: %lu  "
+                        "writes: %lu  r_requests: %lu  reads: %lu",
+                        keycache->blocks_used, keycache->global_blocks_changed,
+                        (ulong) keycache->global_cache_w_requests,
+                        (ulong) keycache->global_cache_write,
+                        (ulong) keycache->global_cache_r_requests,
+                        (ulong) keycache->global_cache_read));
+
+  /*
+    Reset these values to be able to detect a disabled key cache.
+    See Bug#44068 (RESTORE can disable the MyISAM Key Cache).
+  */
+  keycache->blocks_used= 0;
+  keycache->blocks_unused= 0;
+
+  if (cleanup)
+  {
+    mysql_mutex_destroy(&keycache->cache_lock);
+    keycache->key_cache_inited= keycache->can_be_used= 0;
+  }
+  DBUG_VOID_RETURN;
+} /* end_key_cache */
+
+
+/**
+  Link a thread into double-linked queue of waiting threads.
+
+  @param wqueue   pointer to the queue structure
+  @param thread   pointer to the keycache variables for the
+                  thread to be added to the queue
+
+  Queue is represented by a circular list of the keycache variable structures.
+  Since each thread has its own keycache variables, this is equal to a list
+  of threads. The list is double-linked of the type (**prev,*next), accessed by
+  a pointer to the last element.
+*/
+
+static void link_into_queue(KEYCACHE_WQUEUE *wqueue,
+                            st_keycache_thread_var *thread)
+{
+  st_keycache_thread_var *last;
+
+  DBUG_ASSERT(!thread->next && !thread->prev);
+  if (! (last= wqueue->last_thread))
+  {
+    /* Queue is empty */
+    thread->next= thread;
+    thread->prev= &thread->next;
+  }
+  else
+  {
+    thread->prev= last->next->prev;
+    last->next->prev= &thread->next;
+    thread->next= last->next;
+    last->next= thread;
+  }
+  wqueue->last_thread= thread;
+}
+
+
+/**
+  Unlink a thread from double-linked queue of waiting threads
+
+  @param wqueue   pointer to the queue structure
+  @param thread   pointer to the keycache variables for the
+                  thread to be removed to the queue
+
+  @note See link_into_queue
+*/
+
+static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue,
+                              st_keycache_thread_var *thread)
+{
+  DBUG_ASSERT(thread->next && thread->prev);
+  if (thread->next == thread)
+    /* The queue contains only one member */
+    wqueue->last_thread= NULL;
+  else
+  {
+    thread->next->prev= thread->prev;
+    *thread->prev=thread->next;
+    if (wqueue->last_thread == thread)
+      wqueue->last_thread= STRUCT_PTR(st_keycache_thread_var, next,
+                                      thread->prev);
+  }
+  thread->next= NULL;
+#if !defined(DBUG_OFF)
+  /*
+    This makes it easier to see it's not in a chain during debugging.
+    And some DBUG_ASSERT() rely on it.
+  */
+  thread->prev= NULL;
+#endif
+}
+
+
+/*
+  Add a thread to single-linked queue of waiting threads
+
+  SYNOPSIS
+    wait_on_queue()
+      wqueue            Pointer to the queue structure.
+      mutex             Cache_lock to acquire after awake.
+      thread            Thread to be added
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Queue is represented by a circular list of the thread structures
+    The list is single-linked of the type (*next), accessed by a pointer
+    to the last element.
+
+    The function protects against stray signals by verifying that the
+    current thread is unlinked from the queue when awaking. However,
+    since several threads can wait for the same event, it might be
+    necessary for the caller of the function to check again if the
+    condition for awake is indeed matched.
+*/
+
+static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
+                          mysql_mutex_t *mutex,
+                          st_keycache_thread_var *thread)
+{
+  st_keycache_thread_var *last;
+
+  /* Add to queue. */
+  DBUG_ASSERT(!thread->next);
+  DBUG_ASSERT(!thread->prev); /* Not required, but must be true anyway. */
+  if (! (last= wqueue->last_thread))
+    thread->next= thread;
+  else
+  {
+    thread->next= last->next;
+    last->next= thread;
+  }
+  wqueue->last_thread= thread;
+
+  /*
+    Wait until thread is removed from queue by the signalling thread.
+    The loop protects against stray signals.
+  */
+  do
+  {
+    mysql_cond_wait(&thread->suspend, mutex);
+  }
+  while (thread->next);
+}
+
+
+/*
+  Remove all threads from queue signaling them to proceed
+
+  SYNOPSIS
+    release_whole_queue()
+      wqueue            pointer to the queue structure
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See notes for wait_on_queue().
+    When removed from the queue each thread is signaled via condition
+    variable thread->suspend.
+*/
+
+static void release_whole_queue(KEYCACHE_WQUEUE *wqueue)
+{
+  st_keycache_thread_var *last;
+  st_keycache_thread_var *next;
+  st_keycache_thread_var *thread;
+
+  /* Queue may be empty. */
+  if (!(last= wqueue->last_thread))
+    return;
+
+  next= last->next;
+  do
+  {
+    thread=next;
+    /* Signal the thread. */
+    mysql_cond_signal(&thread->suspend);
+    /* Take thread from queue. */
+    next=thread->next;
+    thread->next= NULL;
+  }
+  while (thread != last);
+
+  /* Now queue is definitely empty. */
+  wqueue->last_thread= NULL;
+}
+
+
+/*
+  Unlink a block from the chain of dirty/clean blocks
+*/
+
+static inline void unlink_changed(BLOCK_LINK *block)
+{
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  if (block->next_changed)
+    block->next_changed->prev_changed= block->prev_changed;
+  *block->prev_changed= block->next_changed;
+
+#if !defined(DBUG_OFF)
+  /*
+    This makes it easier to see it's not in a chain during debugging.
+    And some DBUG_ASSERT() rely on it.
+  */
+  block->next_changed= NULL;
+  block->prev_changed= NULL;
+#endif
+}
+
+
+/*
+  Link a block into the chain of dirty/clean blocks
+*/
+
+static inline void link_changed(BLOCK_LINK *block, BLOCK_LINK **phead)
+{
+  DBUG_ASSERT(!block->next_changed);
+  DBUG_ASSERT(!block->prev_changed);
+  block->prev_changed= phead;
+  if ((block->next_changed= *phead))
+    (*phead)->prev_changed= &block->next_changed;
+  *phead= block;
+}
+
+
+/*
+  Link a block in a chain of clean blocks of a file.
+
+  SYNOPSIS
+    link_to_file_list()
+      keycache		Key cache handle
+      block             Block to relink
+      file              File to be linked to
+      unlink            If to unlink first
+
+  DESCRIPTION
+    Unlink a block from whichever chain it is linked in, if it's
+    asked for, and link it to the chain of clean blocks of the
+    specified file.
+
+  NOTE
+    Please do never set/clear BLOCK_CHANGED outside of
+    link_to_file_list() or link_to_changed_list().
+    You would risk to damage correct counting of changed blocks
+    and to find blocks in the wrong hash.
+
+  RETURN
+    void
+*/
+
+static void link_to_file_list(KEY_CACHE *keycache,
+                              BLOCK_LINK *block, int file,
+                              my_bool unlink_block)
+{
+  DBUG_ASSERT(block->status & BLOCK_IN_USE);
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+  DBUG_ASSERT(block->hash_link->file == file);
+  if (unlink_block)
+    unlink_changed(block);
+  link_changed(block, &keycache->file_blocks[FILE_HASH(file)]);
+  if (block->status & BLOCK_CHANGED)
+  {
+    block->status&= ~BLOCK_CHANGED;
+    keycache->blocks_changed--;
+    keycache->global_blocks_changed--;
+  }
+}
+
+
+/*
+  Re-link a block from the clean chain to the dirty chain of a file.
+
+  SYNOPSIS
+    link_to_changed_list()
+      keycache		key cache handle
+      block             block to relink
+
+  DESCRIPTION
+    Unlink a block from the chain of clean blocks of a file
+    and link it to the chain of dirty blocks of the same file.
+
+  NOTE
+    Please do never set/clear BLOCK_CHANGED outside of
+    link_to_file_list() or link_to_changed_list().
+    You would risk to damage correct counting of changed blocks
+    and to find blocks in the wrong hash.
+
+  RETURN
+    void
+*/
+
+static void link_to_changed_list(KEY_CACHE *keycache,
+                                 BLOCK_LINK *block)
+{
+  DBUG_ASSERT(block->status & BLOCK_IN_USE);
+  DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+
+  unlink_changed(block);
+  link_changed(block,
+               &keycache->changed_blocks[FILE_HASH(block->hash_link->file)]);
+  block->status|=BLOCK_CHANGED;
+  keycache->blocks_changed++;
+  keycache->global_blocks_changed++;
+}
+
+
+/*
+  Link a block to the LRU chain at the beginning or at the end of
+  one of two parts.
+
+  SYNOPSIS
+    link_block()
+      keycache            pointer to a key cache data structure
+      block               pointer to the block to link to the LRU chain
+      hot                 <-> to link the block into the hot subchain
+      at_end              <-> to link the block at the end of the subchain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    The LRU ring is represented by a circular list of block structures.
+    The list is double-linked of the type (**prev,*next) type.
+    The LRU ring is divided into two parts - hot and warm.
+    There are two pointers to access the last blocks of these two
+    parts. The beginning of the warm part follows right after the
+    end of the hot part.
+    Only blocks of the warm part can be used for eviction.
+    The first block from the beginning of this subchain is always
+    taken for eviction (keycache->last_used->next)
+
+    LRU chain:       +------+   H O T    +------+
+                +----| end  |----...<----| beg  |----+
+                |    +------+last        +------+    |
+                v<-link in latest hot (new end)      |
+                |     link in latest warm (new end)->^
+                |    +------+  W A R M   +------+    |
+                +----| beg  |---->...----| end  |----+
+                     +------+            +------+ins
+                  first for eviction
+
+    It is also possible that the block is selected for eviction and thus
+    not linked in the LRU ring.
+*/
+
+static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
+                       my_bool at_end)
+{
+  BLOCK_LINK *ins;
+  BLOCK_LINK **pins;
+
+  DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
+  DBUG_ASSERT(!block->requests);
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
+
+  if (!hot && keycache->waiting_for_block.last_thread)
+  {
+    /* Signal that in the LRU warm sub-chain an available block has appeared */
+    st_keycache_thread_var *last_thread=
+      keycache->waiting_for_block.last_thread;
+    st_keycache_thread_var *first_thread= last_thread->next;
+    st_keycache_thread_var *next_thread= first_thread;
+    HASH_LINK *hash_link= (HASH_LINK *) first_thread->opt_info;
+    st_keycache_thread_var *thread;
+    do
+    {
+      thread= next_thread;
+      next_thread= thread->next;
+      /*
+         We notify about the event all threads that ask
+         for the same page as the first thread in the queue
+      */
+      if ((HASH_LINK *) thread->opt_info == hash_link)
+      {
+        mysql_cond_signal(&thread->suspend);
+        unlink_from_queue(&keycache->waiting_for_block, thread);
+        block->requests++;
+      }
+    }
+    while (thread != last_thread);
+    hash_link->block= block;
+    /*
+      NOTE: We assigned the block to the hash_link and signalled the
+      requesting thread(s). But it is possible that other threads runs
+      first. These threads see the hash_link assigned to a block which
+      is assigned to another hash_link and not marked BLOCK_IN_SWITCH.
+      This can be a problem for functions that do not select the block
+      via its hash_link: flush and free. They do only see a block which
+      is in a "normal" state and don't know that it will be evicted soon.
+
+      We cannot set BLOCK_IN_SWITCH here because only one of the
+      requesting threads must handle the eviction. All others must wait
+      for it to complete. If we set the flag here, the threads would not
+      know who is in charge of the eviction. Without the flag, the first
+      thread takes the stick and sets the flag.
+
+      But we need to note in the block that is has been selected for
+      eviction. It must not be freed. The evicting thread will not
+      expect the block in the free list. Before freeing we could also
+      check if block->requests > 1. But I think including another flag
+      in the check of block->status is slightly more efficient and
+      probably easier to read.
+    */
+    block->status|= BLOCK_IN_EVICTION;
+    return;
+  }
+
+  pins= hot ? &keycache->used_ins : &keycache->used_last;
+  ins= *pins;
+  if (ins)
+  {
+    ins->next_used->prev_used= &block->next_used;
+    block->next_used= ins->next_used;
+    block->prev_used= &ins->next_used;
+    ins->next_used= block;
+    if (at_end)
+      *pins= block;
+  }
+  else
+  {
+    /* The LRU ring is empty. Let the block point to itself. */
+    keycache->used_last= keycache->used_ins= block->next_used= block;
+    block->prev_used= &block->next_used;
+  }
+  DBUG_ASSERT((ulong) keycache->blocks_available <=
+              keycache->blocks_used);
+}
+
+
+/*
+  Unlink a block from the LRU chain
+
+  SYNOPSIS
+    unlink_block()
+      keycache            pointer to a key cache data structure
+      block               pointer to the block to unlink from the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See NOTES for link_block
+*/
+
+static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block)
+{
+  DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
+  DBUG_ASSERT(!block->requests);
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  DBUG_ASSERT(block->next_used && block->prev_used &&
+              (block->next_used->prev_used == &block->next_used) &&
+              (*block->prev_used == block));
+  if (block->next_used == block)
+    /* The list contains only one member */
+    keycache->used_last= keycache->used_ins= NULL;
+  else
+  {
+    block->next_used->prev_used= block->prev_used;
+    *block->prev_used= block->next_used;
+    if (keycache->used_last == block)
+      keycache->used_last= STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
+    if (keycache->used_ins == block)
+      keycache->used_ins=STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
+  }
+  block->next_used= NULL;
+#if !defined(DBUG_OFF)
+  /*
+    This makes it easier to see it's not in a chain during debugging.
+    And some DBUG_ASSERT() rely on it.
+  */
+  block->prev_used= NULL;
+#endif
+}
+
+
+/*
+  Register requests for a block.
+
+  SYNOPSIS
+    reg_requests()
+      keycache          Pointer to a key cache data structure.
+      block             Pointer to the block to register a request on.
+      count             Number of requests. Always 1.
+
+  NOTE
+    The first request unlinks the block from the LRU ring. This means
+    that it is protected against eveiction.
+
+  RETURN
+    void
+*/
+static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count)
+{
+  DBUG_ASSERT(block->status & BLOCK_IN_USE);
+  DBUG_ASSERT(block->hash_link);
+
+  if (!block->requests)
+    unlink_block(keycache, block);
+  block->requests+=count;
+}
+
+
+/*
+  Unregister request for a block
+  linking it to the LRU chain if it's the last request
+
+  SYNOPSIS
+    unreg_request()
+    keycache            pointer to a key cache data structure
+    block               pointer to the block to link to the LRU chain
+    at_end              <-> to link the block at the end of the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Every linking to the LRU ring decrements by one a special block
+    counter (if it's positive). If the at_end parameter is TRUE the block is
+    added either at the end of warm sub-chain or at the end of hot sub-chain.
+    It is added to the hot subchain if its counter is zero and number of
+    blocks in warm sub-chain is not less than some low limit (determined by
+    the division_limit parameter). Otherwise the block is added to the warm
+    sub-chain. If the at_end parameter is FALSE the block is always added
+    at beginning of the warm sub-chain.
+    Thus a warm block can be promoted to the hot sub-chain when its counter
+    becomes zero for the first time.
+    At the same time  the block at the very beginning of the hot subchain
+    might be moved to the beginning of the warm subchain if it stays untouched
+    for a too long time (this time is determined by parameter age_threshold).
+
+    It is also possible that the block is selected for eviction and thus
+    not linked in the LRU ring.
+*/
+
+static void unreg_request(KEY_CACHE *keycache,
+                          BLOCK_LINK *block, int at_end)
+{
+  DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
+  DBUG_ASSERT(block->requests);
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
+  /*
+    Unregister the request, but do not link erroneous blocks into the
+    LRU ring.
+  */
+  if (!--block->requests && !(block->status & BLOCK_ERROR))
+  {
+    my_bool hot;
+    if (block->hits_left)
+      block->hits_left--;
+    hot= !block->hits_left && at_end &&
+      keycache->warm_blocks > keycache->min_warm_blocks;
+    if (hot)
+    {
+      if (block->temperature == BLOCK_WARM)
+        keycache->warm_blocks--;
+      block->temperature= BLOCK_HOT;
+    }
+    link_block(keycache, block, hot, (my_bool)at_end);
+    block->last_hit_time= keycache->keycache_time;
+    keycache->keycache_time++;
+    /*
+      At this place, the block might be in the LRU ring or not. If an
+      evicter was waiting for a block, it was selected for eviction and
+      not linked in the LRU ring.
+    */
+
+    /*
+      Check if we should link a hot block to the warm block sub-chain.
+      It is possible that we select the same block as above. But it can
+      also be another block. In any case a block from the LRU ring is
+      selected. In other words it works even if the above block was
+      selected for eviction and not linked in the LRU ring. Since this
+      happens only if the LRU ring is empty, the block selected below
+      would be NULL and the rest of the function skipped.
+    */
+    block= keycache->used_ins;
+    if (block && keycache->keycache_time - block->last_hit_time >
+	keycache->age_threshold)
+    {
+      unlink_block(keycache, block);
+      link_block(keycache, block, 0, 0);
+      if (block->temperature != BLOCK_WARM)
+      {
+        keycache->warm_blocks++;
+        block->temperature= BLOCK_WARM;
+      }
+    }
+  }
+}
+
+/*
+  Remove a reader of the page in block
+*/
+
+static void remove_reader(BLOCK_LINK *block)
+{
+  DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
+  DBUG_ASSERT(block->hash_link->requests);
+
+  if (! --block->hash_link->requests && block->condvar)
+    mysql_cond_signal(block->condvar);
+}
+
+
+/*
+  Wait until the last reader of the page in block
+  signals on its termination
+*/
+
+static void wait_for_readers(KEY_CACHE *keycache,
+                             BLOCK_LINK *block,
+                             st_keycache_thread_var *thread)
+{
+  DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+  DBUG_ASSERT(!(block->status & (BLOCK_IN_FLUSH | BLOCK_CHANGED)));
+  DBUG_ASSERT(block->hash_link);
+  DBUG_ASSERT(block->hash_link->block == block);
+  /* Linked in file_blocks or changed_blocks hash. */
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  /* Not linked in LRU ring. */
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
+  while (block->hash_link->requests)
+  {
+    /* There must be no other waiter. We have no queue here. */
+    DBUG_ASSERT(!block->condvar);
+    block->condvar= &thread->suspend;
+    mysql_cond_wait(&thread->suspend, &keycache->cache_lock);
+    block->condvar= NULL;
+  }
+}
+
+
+/*
+  Add a hash link to a bucket in the hash_table
+*/
+
+static inline void link_hash(HASH_LINK **start, HASH_LINK *hash_link)
+{
+  if (*start)
+    (*start)->prev= &hash_link->next;
+  hash_link->next= *start;
+  hash_link->prev= start;
+  *start= hash_link;
+}
+
+
+/*
+  Remove a hash link from the hash table
+*/
+
+static void unlink_hash(KEY_CACHE *keycache, HASH_LINK *hash_link)
+{
+  DBUG_ASSERT(hash_link->requests == 0);
+  if ((*hash_link->prev= hash_link->next))
+    hash_link->next->prev= hash_link->prev;
+  hash_link->block= NULL;
+
+  if (keycache->waiting_for_hash_link.last_thread)
+  {
+    /* Signal that a free hash link has appeared */
+    st_keycache_thread_var *last_thread=
+                               keycache->waiting_for_hash_link.last_thread;
+    st_keycache_thread_var *first_thread= last_thread->next;
+    st_keycache_thread_var *next_thread= first_thread;
+    KEYCACHE_PAGE *first_page= (KEYCACHE_PAGE *) (first_thread->opt_info);
+    st_keycache_thread_var *thread;
+
+    hash_link->file= first_page->file;
+    hash_link->diskpos= first_page->filepos;
+    do
+    {
+      KEYCACHE_PAGE *page;
+      thread= next_thread;
+      page= (KEYCACHE_PAGE *) thread->opt_info;
+      next_thread= thread->next;
+      /*
+         We notify about the event all threads that ask
+         for the same page as the first thread in the queue
+      */
+      if (page->file == hash_link->file && page->filepos == hash_link->diskpos)
+      {
+        mysql_cond_signal(&thread->suspend);
+        unlink_from_queue(&keycache->waiting_for_hash_link, thread);
+      }
+    }
+    while (thread != last_thread);
+    link_hash(&keycache->hash_root[KEYCACHE_HASH(hash_link->file,
+					         hash_link->diskpos)],
+              hash_link);
+    return;
+  }
+  hash_link->next= keycache->free_hash_list;
+  keycache->free_hash_list= hash_link;
+}
+
+
+/*
+  Get the hash link for a page
+*/
+
+static HASH_LINK *get_hash_link(KEY_CACHE *keycache,
+                                int file, my_off_t filepos,
+                                st_keycache_thread_var *thread)
+{
+  HASH_LINK *hash_link, **start;
+#ifndef DBUG_OFF
+  int cnt;
+#endif
+
+restart:
+  /*
+     Find the bucket in the hash table for the pair (file, filepos);
+     start contains the head of the bucket list,
+     hash_link points to the first member of the list
+  */
+  hash_link= *(start= &keycache->hash_root[KEYCACHE_HASH(file, filepos)]);
+#ifndef DBUG_OFF
+  cnt= 0;
+#endif
+  /* Look for an element for the pair (file, filepos) in the bucket chain */
+  while (hash_link &&
+         (hash_link->diskpos != filepos || hash_link->file != file))
+  {
+    hash_link= hash_link->next;
+#ifndef DBUG_OFF
+    cnt++;
+    DBUG_ASSERT(cnt <= keycache->hash_links_used);
+#endif
+  }
+  if (! hash_link)
+  {
+    /* There is no hash link in the hash table for the pair (file, filepos) */
+    if (keycache->free_hash_list)
+    {
+      hash_link= keycache->free_hash_list;
+      keycache->free_hash_list= hash_link->next;
+    }
+    else if (keycache->hash_links_used < keycache->hash_links)
+    {
+      hash_link= &keycache->hash_link_root[keycache->hash_links_used++];
+    }
+    else
+    {
+      /* Wait for a free hash link */
+      KEYCACHE_PAGE page;
+      page.file= file;
+      page.filepos= filepos;
+      thread->opt_info= (void *) &page;
+      link_into_queue(&keycache->waiting_for_hash_link, thread);
+      mysql_cond_wait(&thread->suspend,
+                                 &keycache->cache_lock);
+      thread->opt_info= NULL;
+      goto restart;
+    }
+    hash_link->file= file;
+    hash_link->diskpos= filepos;
+    link_hash(start, hash_link);
+  }
+  /* Register the request for the page */
+  hash_link->requests++;
+
+  return hash_link;
+}
+
+
+/*
+  Get a block for the file page requested by a keycache read/write operation;
+  If the page is not in the cache return a free block, if there is none
+  return the lru block after saving its buffer if the page is dirty.
+
+  SYNOPSIS
+
+    find_key_block()
+      keycache            pointer to a key cache data structure
+      thread              pointer to thread specific variables
+      file                handler for the file to read page from
+      filepos             position of the page in the file
+      init_hits_left      how initialize the block counter for the page
+      wrmode              <-> get for writing
+      page_st        out  {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
+
+  RETURN VALUE
+    Pointer to the found block if successful, 0 - otherwise
+
+  NOTES.
+    For the page from file positioned at filepos the function checks whether
+    the page is in the key cache specified by the first parameter.
+    If this is the case it immediately returns the block.
+    If not, the function first chooses  a block for this page. If there is
+    no not used blocks in the key cache yet, the function takes the block
+    at the very beginning of the warm sub-chain. It saves the page in that
+    block if it's dirty before returning the pointer to it.
+    The function returns in the page_st parameter the following values:
+      PAGE_READ         - if page already in the block,
+      PAGE_TO_BE_READ   - if it is to be read yet by the current thread
+      WAIT_TO_BE_READ   - if it is to be read by another thread
+    If an error occurs THE BLOCK_ERROR bit is set in the block status.
+    It might happen that there are no blocks in LRU chain (in warm part) -
+    all blocks  are unlinked for some read/write operations. Then the function
+    waits until first of this operations links any block back.
+*/
+
+static BLOCK_LINK *find_key_block(KEY_CACHE *keycache,
+                                  st_keycache_thread_var *thread,
+                                  File file, my_off_t filepos,
+                                  int init_hits_left,
+                                  int wrmode, int *page_st)
+{
+  HASH_LINK *hash_link;
+  BLOCK_LINK *block;
+  int error= 0;
+  int page_status;
+
+  DBUG_ENTER("find_key_block");
+  DBUG_PRINT("enter", ("fd: %d  pos: %lu  wrmode: %d",
+                       file, (ulong) filepos, wrmode));
+
+restart:
+  /*
+    If the flush phase of a resize operation fails, the cache is left
+    unusable. This will be detected only after "goto restart".
+  */
+  if (!keycache->can_be_used)
+    DBUG_RETURN(0);
+
+  /*
+    Find the hash_link for the requested file block (file, filepos). We
+    do always get a hash_link here. It has registered our request so
+    that no other thread can use it for another file block until we
+    release the request (which is done by remove_reader() usually). The
+    hash_link can have a block assigned to it or not. If there is a
+    block, it may be assigned to this hash_link or not. In cases where a
+    block is evicted from the cache, it is taken from the LRU ring and
+    referenced by the new hash_link. But the block can still be assigned
+    to its old hash_link for some time if it needs to be flushed first,
+    or if there are other threads still reading it.
+
+    Summary:
+      hash_link is always returned.
+      hash_link->block can be:
+      - NULL or
+      - not assigned to this hash_link or
+      - assigned to this hash_link. If assigned, the block can have
+        - invalid data (when freshly assigned) or
+        - valid data. Valid data can be
+          - changed over the file contents (dirty) or
+          - not changed (clean).
+  */
+  hash_link= get_hash_link(keycache, file, filepos, thread);
+  DBUG_ASSERT((hash_link->file == file) && (hash_link->diskpos == filepos));
+
+  page_status= -1;
+  if ((block= hash_link->block) &&
+      block->hash_link == hash_link && (block->status & BLOCK_READ))
+  {
+    /* Assigned block with valid (changed or unchanged) contents. */
+    page_status= PAGE_READ;
+  }
+  /*
+    else (page_status == -1)
+      - block == NULL or
+      - block not assigned to this hash_link or
+      - block assigned but not yet read from file (invalid data).
+  */
+
+  if (keycache->in_resize)
+  {
+    /* This is a request during a resize operation */
+
+    if (!block)
+    {
+      /*
+        The file block is not in the cache. We don't need it in the
+        cache: we are going to read or write directly to file. Cancel
+        the request. We can simply decrement hash_link->requests because
+        we did not release cache_lock since increasing it. So no other
+        thread can wait for our request to become released.
+      */
+      if (hash_link->requests == 1)
+      {
+        /*
+          We are the only one to request this hash_link (this file/pos).
+          Free the hash_link.
+        */
+        hash_link->requests--;
+        unlink_hash(keycache, hash_link);
+        DBUG_RETURN(0);
+      }
+
+      /*
+        More requests on the hash_link. Someone tries to evict a block
+        for this hash_link (could have started before resizing started).
+        This means that the LRU ring is empty. Otherwise a block could
+        be assigned immediately. Behave like a thread that wants to
+        evict a block for this file/pos. Add to the queue of threads
+        waiting for a block. Wait until there is one assigned.
+
+        Refresh the request on the hash-link so that it cannot be reused
+        for another file/pos.
+      */
+      thread->opt_info= (void *) hash_link;
+      link_into_queue(&keycache->waiting_for_block, thread);
+      do
+      {
+        mysql_cond_wait(&thread->suspend,
+                                   &keycache->cache_lock);
+      } while (thread->next);
+      thread->opt_info= NULL;
+      /*
+        A block should now be assigned to the hash_link. But it may
+        still need to be evicted. Anyway, we should re-check the
+        situation. page_status must be set correctly.
+      */
+      hash_link->requests--;
+      goto restart;
+    } /* end of if (!block) */
+
+    /*
+      There is a block for this file/pos in the cache. Register a
+      request on it. This unlinks it from the LRU ring (if it is there)
+      and hence protects it against eviction (if not already in
+      eviction). We need this for returning the block to the caller, for
+      calling remove_reader() (for debugging purposes), and for calling
+      free_block(). The only case where we don't need the request is if
+      the block is in eviction. In that case we have to unregister the
+      request later.
+    */
+    reg_requests(keycache, block, 1);
+
+    if (page_status != PAGE_READ)
+    {
+      /*
+        - block not assigned to this hash_link or
+        - block assigned but not yet read from file (invalid data).
+
+        This must be a block in eviction. It will be read soon. We need
+        to wait here until this happened. Otherwise the caller could
+        access a wrong block or a block which is in read. While waiting
+        we cannot lose hash_link nor block. We have registered a request
+        on the hash_link. Everything can happen to the block but changes
+        in the hash_link -> block relationship. In other words:
+        everything can happen to the block but free or another completed
+        eviction.
+
+        Note that we bahave like a secondary requestor here. We just
+        cannot return with PAGE_WAIT_TO_BE_READ. This would work for
+        read requests and writes on dirty blocks that are not in flush
+        only. Waiting here on COND_FOR_REQUESTED works in all
+        situations.
+      */
+      DBUG_ASSERT(((block->hash_link != hash_link) &&
+                   (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
+                  ((block->hash_link == hash_link) &&
+                   !(block->status & BLOCK_READ)));
+      wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock,
+                    thread);
+      /*
+        Here we can trust that the block has been assigned to this
+        hash_link (block->hash_link == hash_link) and read into the
+        buffer (BLOCK_READ). The worst things possible here are that the
+        block is in free (BLOCK_REASSIGNED). But the block is still
+        assigned to the hash_link. The freeing thread waits until we
+        release our request on the hash_link. The block must not be
+        again in eviction because we registered an request on it before
+        starting to wait.
+      */
+      DBUG_ASSERT(block->hash_link == hash_link);
+      DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+      DBUG_ASSERT(!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)));
+    }
+    /*
+      The block is in the cache. Assigned to the hash_link. Valid data.
+      Note that in case of page_st == PAGE_READ, the block can be marked
+      for eviction. In any case it can be marked for freeing.
+    */
+
+    if (!wrmode)
+    {
+      /* A reader can just read the block. */
+      *page_st= PAGE_READ;
+      DBUG_ASSERT((hash_link->file == file) &&
+                  (hash_link->diskpos == filepos) &&
+                  (block->hash_link == hash_link));
+      DBUG_RETURN(block);
+    }
+
+    /*
+      This is a writer. No two writers for the same block can exist.
+      This must be assured by locks outside of the key cache.
+    */
+    DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
+
+    while (block->status & BLOCK_IN_FLUSH)
+    {
+      /*
+        Wait until the block is flushed to file. Do not release the
+        request on the hash_link yet to prevent that the block is freed
+        or reassigned while we wait. While we wait, several things can
+        happen to the block, including another flush. But the block
+        cannot be reassigned to another hash_link until we release our
+        request on it. But it can be marked BLOCK_REASSIGNED from free
+        or eviction, while they wait for us to release the hash_link.
+      */
+      wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
+                    thread);
+      /*
+        If the flush phase failed, the resize could have finished while
+        we waited here.
+      */
+      if (!keycache->in_resize)
+      {
+        remove_reader(block);
+        unreg_request(keycache, block, 1);
+        goto restart;
+      }
+      DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+      DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
+      DBUG_ASSERT(block->hash_link == hash_link);
+    }
+
+    if (block->status & BLOCK_CHANGED)
+    {
+      /*
+        We want to write a block with changed contents. If the cache
+        block size is bigger than the callers block size (e.g. MyISAM),
+        the caller may replace part of the block only. Changes of the
+        other part of the block must be preserved. Since the block has
+        not yet been selected for flush, we can still add our changes.
+      */
+      *page_st= PAGE_READ;
+      DBUG_ASSERT((hash_link->file == file) &&
+                  (hash_link->diskpos == filepos) &&
+                  (block->hash_link == hash_link));
+      DBUG_RETURN(block);
+    }
+
+    /*
+      This is a write request for a clean block. We do not want to have
+      new dirty blocks in the cache while resizing. We will free the
+      block and write directly to file. If the block is in eviction or
+      in free, we just let it go.
+
+      Unregister from the hash_link. This must be done before freeing
+      the block. And it must be done if not freeing the block. Because
+      we could have waited above, we need to call remove_reader(). Other
+      threads could wait for us to release our request on the hash_link.
+    */
+    remove_reader(block);
+
+    /* If the block is not in eviction and not in free, we can free it. */
+    if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                           BLOCK_REASSIGNED)))
+    {
+      /*
+        Free block as we are going to write directly to file.
+        Although we have an exlusive lock for the updated key part,
+        the control can be yielded by the current thread as we might
+        have unfinished readers of other key parts in the block
+        buffer. Still we are guaranteed not to have any readers
+        of the key part we are writing into until the block is
+        removed from the cache as we set the BLOCK_REASSIGNED
+        flag (see the code below that handles reading requests).
+      */
+      free_block(keycache, thread, block);
+    }
+    else
+    {
+      /*
+        The block will be evicted/freed soon. Don't touch it in any way.
+        Unregister the request that we registered above.
+      */
+      unreg_request(keycache, block, 1);
+
+      /*
+        The block is still assigned to the hash_link (the file/pos that
+        we are going to write to). Wait until the eviction/free is
+        complete. Otherwise the direct write could complete before all
+        readers are done with the block. So they could read outdated
+        data.
+
+        Since we released our request on the hash_link, it can be reused
+        for another file/pos. Hence we cannot just check for
+        block->hash_link == hash_link. As long as the resize is
+        proceeding the block cannot be reassigned to the same file/pos
+        again. So we can terminate the loop when the block is no longer
+        assigned to this file/pos.
+      */
+      do
+      {
+        wait_on_queue(&block->wqueue[COND_FOR_SAVED],
+                      &keycache->cache_lock, thread);
+        /*
+          If the flush phase failed, the resize could have finished
+          while we waited here.
+        */
+        if (!keycache->in_resize)
+          goto restart;
+      } while (block->hash_link &&
+               (block->hash_link->file == file) &&
+               (block->hash_link->diskpos == filepos));
+    }
+    DBUG_RETURN(0);
+  }
+
+  if (page_status == PAGE_READ &&
+      (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                        BLOCK_REASSIGNED)))
+  {
+    /*
+      This is a request for a block to be removed from cache. The block
+      is assigned to this hash_link and contains valid data, but is
+      marked for eviction or to be freed. Possible reasons why it has
+      not yet been evicted/freed can be a flush before reassignment
+      (BLOCK_IN_SWITCH), readers of the block have not finished yet
+      (BLOCK_REASSIGNED), or the evicting thread did not yet awake after
+      the block has been selected for it (BLOCK_IN_EVICTION).
+    */
+
+    /*
+       Only reading requests can proceed until the old dirty page is flushed,
+       all others are to be suspended, then resubmitted
+    */
+    if (!wrmode && !(block->status & BLOCK_REASSIGNED))
+    {
+      /*
+        This is a read request and the block not yet reassigned. We can
+        register our request and proceed. This unlinks the block from
+        the LRU ring and protects it against eviction.
+      */
+      reg_requests(keycache, block, 1);
+    }
+    else
+    {
+      /*
+        Either this is a write request for a block that is in eviction
+        or in free. We must not use it any more. Instead we must evict
+        another block. But we cannot do this before the eviction/free is
+        done. Otherwise we would find the same hash_link + block again
+        and again.
+
+        Or this is a read request for a block in eviction/free that does
+        not require a flush, but waits for readers to finish with the
+        block. We do not read this block to let the eviction/free happen
+        as soon as possible. Again we must wait so that we don't find
+        the same hash_link + block again and again.
+      */
+      DBUG_ASSERT(hash_link->requests);
+      hash_link->requests--;
+      wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
+                    thread);
+      /*
+        The block is no longer assigned to this hash_link.
+        Get another one.
+      */
+      goto restart;
+    }
+  }
+  else
+  {
+    /*
+      This is a request for a new block or for a block not to be removed.
+      Either
+      - block == NULL or
+      - block not assigned to this hash_link or
+      - block assigned but not yet read from file,
+      or
+      - block assigned with valid (changed or unchanged) data and
+      - it will not be reassigned/freed.
+    */
+    if (! block)
+    {
+      /* No block is assigned to the hash_link yet. */
+      if (keycache->blocks_unused)
+      {
+        if (keycache->free_block_list)
+        {
+          /* There is a block in the free list. */
+          block= keycache->free_block_list;
+          keycache->free_block_list= block->next_used;
+          block->next_used= NULL;
+        }
+        else
+        {
+          size_t block_mem_offset;
+          /* There are some never used blocks, take first of them */
+          DBUG_ASSERT(keycache->blocks_used <
+                      (ulong) keycache->disk_blocks);
+          block= &keycache->block_root[keycache->blocks_used];
+          block_mem_offset= 
+           ((size_t) keycache->blocks_used) * keycache->key_cache_block_size;
+          block->buffer= ADD_TO_PTR(keycache->block_mem,
+                                    block_mem_offset,
+                                    uchar*);
+          keycache->blocks_used++;
+          DBUG_ASSERT(!block->next_used);
+        }
+        DBUG_ASSERT(!block->prev_used);
+        DBUG_ASSERT(!block->next_changed);
+        DBUG_ASSERT(!block->prev_changed);
+        DBUG_ASSERT(!block->hash_link);
+        DBUG_ASSERT(!block->status);
+        DBUG_ASSERT(!block->requests);
+        keycache->blocks_unused--;
+        block->status= BLOCK_IN_USE;
+        block->length= 0;
+        block->offset= keycache->key_cache_block_size;
+        block->requests= 1;
+        block->temperature= BLOCK_COLD;
+        block->hits_left= init_hits_left;
+        block->last_hit_time= 0;
+        block->hash_link= hash_link;
+        hash_link->block= block;
+        link_to_file_list(keycache, block, file, 0);
+        page_status= PAGE_TO_BE_READ;
+      }
+      else
+      {
+	/*
+          There are no free blocks and no never used blocks, use a block
+          from the LRU ring.
+        */
+
+        if (! keycache->used_last)
+        {
+          /*
+            The LRU ring is empty. Wait until a new block is added to
+            it. Several threads might wait here for the same hash_link,
+            all of them must get the same block. While waiting for a
+            block, after a block is selected for this hash_link, other
+            threads can run first before this one awakes. During this
+            time interval other threads find this hash_link pointing to
+            the block, which is still assigned to another hash_link. In
+            this case the block is not marked BLOCK_IN_SWITCH yet, but
+            it is marked BLOCK_IN_EVICTION.
+          */
+
+          thread->opt_info= (void *) hash_link;
+          link_into_queue(&keycache->waiting_for_block, thread);
+          do
+          {
+            mysql_cond_wait(&thread->suspend,
+                                       &keycache->cache_lock);
+          }
+          while (thread->next);
+          thread->opt_info= NULL;
+          /* Assert that block has a request registered. */
+          DBUG_ASSERT(hash_link->block->requests);
+          /* Assert that block is not in LRU ring. */
+          DBUG_ASSERT(!hash_link->block->next_used);
+          DBUG_ASSERT(!hash_link->block->prev_used);
+        }
+
+        /*
+          If we waited above, hash_link->block has been assigned by
+          link_block(). Otherwise it is still NULL. In the latter case
+          we need to grab a block from the LRU ring ourselves.
+        */
+        block= hash_link->block;
+        if (! block)
+        {
+          /* Select the last block from the LRU ring. */
+          block= keycache->used_last->next_used;
+          block->hits_left= init_hits_left;
+          block->last_hit_time= 0;
+          hash_link->block= block;
+          /*
+            Register a request on the block. This unlinks it from the
+            LRU ring and protects it against eviction.
+          */
+          DBUG_ASSERT(!block->requests);
+          reg_requests(keycache, block,1);
+          /*
+            We do not need to set block->status|= BLOCK_IN_EVICTION here
+            because we will set block->status|= BLOCK_IN_SWITCH
+            immediately without releasing the lock in between. This does
+            also support debugging. When looking at the block, one can
+            see if the block has been selected by link_block() after the
+            LRU ring was empty, or if it was grabbed directly from the
+            LRU ring in this branch.
+          */
+        }
+
+        /*
+          If we had to wait above, there is a small chance that another
+          thread grabbed this block for the same file block already. But
+          in most cases the first condition is true.
+        */
+        if (block->hash_link != hash_link &&
+	    ! (block->status & BLOCK_IN_SWITCH) )
+        {
+	  /* this is a primary request for a new page */
+          block->status|= BLOCK_IN_SWITCH;
+
+          if (block->status & BLOCK_CHANGED)
+          {
+	    /* The block contains a dirty page - push it out of the cache */
+
+            if (block->status & BLOCK_IN_FLUSH)
+            {
+              /*
+                The block is marked for flush. If we do not wait here,
+                it could happen that we write the block, reassign it to
+                another file block, then, before the new owner can read
+                the new file block, the flusher writes the cache block
+                (which still has the old contents) to the new file block!
+              */
+              wait_on_queue(&block->wqueue[COND_FOR_SAVED],
+                            &keycache->cache_lock, thread);
+              /*
+                The block is marked BLOCK_IN_SWITCH. It should be left
+                alone except for reading. No free, no write.
+              */
+              DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+              DBUG_ASSERT(!(block->status & (BLOCK_REASSIGNED |
+                                             BLOCK_CHANGED |
+                                             BLOCK_FOR_UPDATE)));
+            }
+            else
+            {
+              block->status|= BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE;
+              /*
+                BLOCK_IN_EVICTION may be true or not. Other flags must
+                have a fixed value.
+              */
+              DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
+                          (BLOCK_READ | BLOCK_IN_SWITCH |
+                           BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
+                           BLOCK_CHANGED | BLOCK_IN_USE));
+              DBUG_ASSERT(block->hash_link);
+
+              mysql_mutex_unlock(&keycache->cache_lock);
+              /*
+                The call is thread safe because only the current
+                thread might change the block->hash_link value
+              */
+              error= (int)my_pwrite(block->hash_link->file,
+                                    block->buffer + block->offset,
+                                    block->length - block->offset,
+                                    block->hash_link->diskpos + block->offset,
+                                    MYF(MY_NABP | MY_WAIT_IF_FULL));
+              mysql_mutex_lock(&keycache->cache_lock);
+
+              /* Block status must not have changed. */
+              DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
+                          (BLOCK_READ | BLOCK_IN_SWITCH |
+                           BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
+                           BLOCK_CHANGED | BLOCK_IN_USE) || fail_block(block));
+              keycache->global_cache_write++;
+            }
+          }
+
+          block->status|= BLOCK_REASSIGNED;
+          /*
+            The block comes from the LRU ring. It must have a hash_link
+            assigned.
+          */
+          DBUG_ASSERT(block->hash_link);
+          if (block->hash_link)
+          {
+            /*
+              All pending requests for this page must be resubmitted.
+              This must be done before waiting for readers. They could
+              wait for the flush to complete. And we must also do it
+              after the wait. Flushers might try to free the block while
+              we wait. They would wait until the reassignment is
+              complete. Also the block status must reflect the correct
+              situation: The block is not changed nor in flush any more.
+              Note that we must not change the BLOCK_CHANGED flag
+              outside of link_to_file_list() so that it is always in the
+              correct queue and the *blocks_changed counters are
+              correct.
+            */
+            block->status&= ~(BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE);
+            link_to_file_list(keycache, block, block->hash_link->file, 1);
+            release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
+            /*
+              The block is still assigned to its old hash_link.
+	      Wait until all pending read requests
+	      for this page are executed
+	      (we could have avoided this waiting, if we had read
+	      a page in the cache in a sweep, without yielding control)
+            */
+            wait_for_readers(keycache, block, thread);
+            DBUG_ASSERT(block->hash_link && block->hash_link->block == block &&
+                        block->prev_changed);
+            /* The reader must not have been a writer. */
+            DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
+
+            /* Wake flushers that might have found the block in between. */
+            release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
+
+            /* Remove the hash link for the old file block from the hash. */
+            unlink_hash(keycache, block->hash_link);
+
+            /*
+              For sanity checks link_to_file_list() asserts that block
+              and hash_link refer to each other. Hence we need to assign
+              the hash_link first, but then we would not know if it was
+              linked before. Hence we would not know if to unlink it. So
+              unlink it here and call link_to_file_list(..., FALSE).
+            */
+            unlink_changed(block);
+          }
+          block->status= error ? BLOCK_ERROR : BLOCK_IN_USE ;
+          block->length= 0;
+          block->offset= keycache->key_cache_block_size;
+          block->hash_link= hash_link;
+          link_to_file_list(keycache, block, file, 0);
+          page_status= PAGE_TO_BE_READ;
+
+          DBUG_ASSERT(block->hash_link->block == block);
+          DBUG_ASSERT(hash_link->block->hash_link == hash_link);
+        }
+        else
+        {
+          /*
+            Either (block->hash_link == hash_link),
+	    or     (block->status & BLOCK_IN_SWITCH).
+
+            This is for secondary requests for a new file block only.
+            Either it is already assigned to the new hash_link meanwhile
+            (if we had to wait due to empty LRU), or it is already in
+            eviction by another thread. Since this block has been
+            grabbed from the LRU ring and attached to this hash_link,
+            another thread cannot grab the same block from the LRU ring
+            anymore. If the block is in eviction already, it must become
+            attached to the same hash_link and as such destined for the
+            same file block.
+          */
+          page_status= (((block->hash_link == hash_link) &&
+                         (block->status & BLOCK_READ)) ?
+                        PAGE_READ : PAGE_WAIT_TO_BE_READ);
+        }
+      }
+    }
+    else
+    {
+      /*
+        Block is not NULL. This hash_link points to a block.
+        Either
+        - block not assigned to this hash_link (yet) or
+        - block assigned but not yet read from file,
+        or
+        - block assigned with valid (changed or unchanged) data and
+        - it will not be reassigned/freed.
+
+        The first condition means hash_link points to a block in
+        eviction. This is not necessarily marked by BLOCK_IN_SWITCH yet.
+        But then it is marked BLOCK_IN_EVICTION. See the NOTE in
+        link_block(). In both cases it is destined for this hash_link
+        and its file block address. When this hash_link got its block
+        address, the block was removed from the LRU ring and cannot be
+        selected for eviction (for another hash_link) again.
+
+        Register a request on the block. This is another protection
+        against eviction.
+      */
+      DBUG_ASSERT(((block->hash_link != hash_link) &&
+                   (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
+                  ((block->hash_link == hash_link) &&
+                   !(block->status & BLOCK_READ)) ||
+                  ((block->status & BLOCK_READ) &&
+                   !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))));
+      reg_requests(keycache, block, 1);
+      page_status= (((block->hash_link == hash_link) &&
+                     (block->status & BLOCK_READ)) ?
+                    PAGE_READ : PAGE_WAIT_TO_BE_READ);
+    }
+  }
+
+  DBUG_ASSERT(page_status != -1);
+  /* Same assert basically, but be very sure. */
+  DBUG_ASSERT(block);
+  /* Assert that block has a request and is not in LRU ring. */
+  DBUG_ASSERT(block->requests);
+  DBUG_ASSERT(!block->next_used);
+  DBUG_ASSERT(!block->prev_used);
+  /* Assert that we return the correct block. */
+  DBUG_ASSERT((page_status == PAGE_WAIT_TO_BE_READ) ||
+              ((block->hash_link->file == file) &&
+               (block->hash_link->diskpos == filepos)));
+  *page_st=page_status;
+  DBUG_RETURN(block);
+}
+
+
+/*
+  Read into a key cache block buffer from disk.
+
+  SYNOPSIS
+
+    read_block()
+      keycache            pointer to a key cache data structure
+      thread_var          pointer to thread specific variables
+      block               block to which buffer the data is to be read
+      read_length         size of data to be read
+      min_length          at least so much data must be read
+      primary             <-> the current thread will read the data
+
+  RETURN VALUE
+    None
+
+  NOTES.
+    The function either reads a page data from file to the block buffer,
+    or waits until another thread reads it. What page to read is determined
+    by a block parameter - reference to a hash link for this page.
+    If an error occurs THE BLOCK_ERROR bit is set in the block status.
+    We do not report error when the size of successfully read
+    portion is less than read_length, but not less than min_length.
+*/
+
+static void read_block(KEY_CACHE *keycache,
+                       st_keycache_thread_var *thread_var,
+                       BLOCK_LINK *block, uint read_length,
+                       uint min_length, my_bool primary)
+{
+  size_t got_length;
+
+  /* On entry cache_lock is locked */
+
+  if (primary)
+  {
+    /*
+      This code is executed only by threads that submitted primary
+      requests. Until block->status contains BLOCK_READ, all other
+      request for the block become secondary requests. For a primary
+      request the block must be properly initialized.
+    */
+    DBUG_ASSERT(((block->status & ~BLOCK_FOR_UPDATE) == BLOCK_IN_USE) ||
+                fail_block(block));
+    DBUG_ASSERT((block->length == 0) || fail_block(block));
+    DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
+                fail_block(block));
+    DBUG_ASSERT((block->requests > 0) || fail_block(block));
+
+    keycache->global_cache_read++;
+    /* Page is not in buffer yet, is to be read from disk */
+    mysql_mutex_unlock(&keycache->cache_lock);
+    /*
+      Here other threads may step in and register as secondary readers.
+      They will register in block->wqueue[COND_FOR_REQUESTED].
+    */
+    got_length= my_pread(block->hash_link->file, block->buffer,
+                         read_length, block->hash_link->diskpos, MYF(0));
+    mysql_mutex_lock(&keycache->cache_lock);
+    /*
+      The block can now have been marked for free (in case of
+      FLUSH_RELEASE). Otherwise the state must be unchanged.
+    */
+    DBUG_ASSERT(((block->status & ~(BLOCK_REASSIGNED |
+                                    BLOCK_FOR_UPDATE)) == BLOCK_IN_USE) ||
+                fail_block(block));
+    DBUG_ASSERT((block->length == 0) || fail_block(block));
+    DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
+                fail_block(block));
+    DBUG_ASSERT((block->requests > 0) || fail_block(block));
+
+    if (got_length < min_length)
+      block->status|= BLOCK_ERROR;
+    else
+    {
+      block->status|= BLOCK_READ;
+      block->length= (int)got_length;
+      /*
+        Do not set block->offset here. If this block is marked
+        BLOCK_CHANGED later, we want to flush only the modified part. So
+        only a writer may set block->offset down from
+        keycache->key_cache_block_size.
+      */
+    }
+    /* Signal that all pending requests for this page now can be processed */
+    release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
+  }
+  else
+  {
+    /*
+      This code is executed only by threads that submitted secondary
+      requests. At this point it could happen that the cache block is
+      not yet assigned to the hash_link for the requested file block.
+      But at awake from the wait this should be the case. Unfortunately
+      we cannot assert this here because we do not know the hash_link
+      for the requested file block nor the file and position. So we have
+      to assert this in the caller.
+    */
+    wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock,
+                  thread_var);
+  }
+}
+
+
+/*
+  Read a block of data from a cached file into a buffer;
+
+  SYNOPSIS
+
+    key_cache_read()
+      keycache            pointer to a key cache data structure
+      thread_var          pointer to thread specific variables
+      file                handler for the file for the block of data to be read
+      filepos             position of the block of data in the file
+      level               determines the weight of the data
+      buff                buffer to where the data must be placed
+      length              length of the buffer
+      block_length        length of the block in the key cache buffer
+      return_buffer       return pointer to the key cache buffer with the data
+
+  RETURN VALUE
+    Returns address from where the data is placed if sucessful, 0 - otherwise.
+
+  NOTES.
+    The function ensures that a block of data of size length from file
+    positioned at filepos is in the buffers for some key cache blocks.
+    Then the function either copies the data into the buffer buff, or,
+    if return_buffer is TRUE, it just returns the pointer to the key cache
+    buffer with the data.
+    Filepos must be a multiple of 'block_length', but it doesn't
+    have to be a multiple of key_cache_block_size;
+*/
+
+uchar *key_cache_read(KEY_CACHE *keycache,
+                      st_keycache_thread_var *thread_var,
+                      File file, my_off_t filepos, int level,
+                      uchar *buff, uint length,
+                      uint block_length MY_ATTRIBUTE((unused)),
+                      int return_buffer MY_ATTRIBUTE((unused)))
+{
+  my_bool locked_and_incremented= FALSE;
+  int error=0;
+  uchar *start= buff;
+  DBUG_ENTER("key_cache_read");
+  DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
+               (uint) file, (ulong) filepos, length));
+
+  if (keycache->key_cache_inited)
+  {
+    /* Key cache is used */
+    BLOCK_LINK *block;
+    uint read_length;
+    uint offset;
+    int page_st;
+
+    if (MYSQL_KEYCACHE_READ_START_ENABLED())
+    {
+      MYSQL_KEYCACHE_READ_START(my_filename(file), length,
+                                (ulong) (keycache->blocks_used *
+                                         keycache->key_cache_block_size),
+                                (ulong) (keycache->blocks_unused *
+                                         keycache->key_cache_block_size));
+    }
+  
+    /*
+      When the key cache is once initialized, we use the cache_lock to
+      reliably distinguish the cases of normal operation, resizing, and
+      disabled cache. We always increment and decrement
+      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
+    */
+    mysql_mutex_lock(&keycache->cache_lock);
+    /*
+      Cache resizing has two phases: Flushing and re-initializing. In
+      the flush phase read requests are allowed to bypass the cache for
+      blocks not in the cache. find_key_block() returns NULL in this
+      case.
+
+      After the flush phase new I/O requests must wait until the
+      re-initialization is done. The re-initialization can be done only
+      if no I/O request is in progress. The reason is that
+      key_cache_block_size can change. With enabled cache, I/O is done
+      in chunks of key_cache_block_size. Every chunk tries to use a
+      cache block first. If the block size changes in the middle, a
+      block could be missed and old data could be read.
+    */
+    while (keycache->in_resize && !keycache->resize_in_flush)
+      wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
+                    thread_var);
+    /* Register the I/O for the next resize. */
+    inc_counter_for_resize_op(keycache);
+    locked_and_incremented= TRUE;
+    /* Requested data may not always be aligned to cache blocks. */
+    offset= (uint) (filepos % keycache->key_cache_block_size);
+    /* Read data in key_cache_block_size increments */
+    do
+    {
+      /* Cache could be disabled in a later iteration. */
+      if (!keycache->can_be_used)
+      {
+        goto no_key_cache;
+      }
+      /* Start reading at the beginning of the cache block. */
+      filepos-= offset;
+      /* Do not read beyond the end of the cache block. */
+      read_length= length;
+      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
+      DBUG_ASSERT(read_length > 0);
+
+      if (block_length > keycache->key_cache_block_size || offset)
+	return_buffer=0;
+
+      /* Request the cache block that matches file/pos. */
+      keycache->global_cache_r_requests++;
+
+      MYSQL_KEYCACHE_READ_BLOCK(keycache->key_cache_block_size);
+
+      block= find_key_block(keycache, thread_var, file, filepos, level, 0,
+                            &page_st);
+      if (!block)
+      {
+        /*
+          This happens only for requests submitted during key cache
+          resize. The block is not in the cache and shall not go in.
+          Read directly from file.
+        */
+        keycache->global_cache_read++;
+        mysql_mutex_unlock(&keycache->cache_lock);
+        error= (my_pread(file, (uchar*) buff, read_length,
+                         filepos + offset, MYF(MY_NABP)) != 0);
+        mysql_mutex_lock(&keycache->cache_lock);
+        goto next_block;
+      }
+      if (!(block->status & BLOCK_ERROR))
+      {
+        if (page_st != PAGE_READ)
+        {
+          MYSQL_KEYCACHE_READ_MISS();
+          /* The requested page is to be read into the block buffer */
+          read_block(keycache, thread_var, block,
+                     keycache->key_cache_block_size, read_length+offset,
+                     (my_bool)(page_st == PAGE_TO_BE_READ));
+          /*
+            A secondary request must now have the block assigned to the
+            requested file block. It does not hurt to check it for
+            primary requests too.
+          */
+          DBUG_ASSERT(keycache->can_be_used);
+          DBUG_ASSERT(block->hash_link->file == file);
+          DBUG_ASSERT(block->hash_link->diskpos == filepos);
+          DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        }
+        else if (block->length < read_length + offset)
+        {
+          /*
+            Impossible if nothing goes wrong:
+            this could only happen if we are using a file with
+            small key blocks and are trying to read outside the file
+          */
+          set_my_errno(-1);
+          block->status|= BLOCK_ERROR;
+        }
+        else
+        {
+          MYSQL_KEYCACHE_READ_HIT();
+        }
+      }
+
+      /* block status may have added BLOCK_ERROR in the above 'if'. */
+      if (!(block->status & BLOCK_ERROR))
+      {
+        {
+          DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+          mysql_mutex_unlock(&keycache->cache_lock);
+
+          /* Copy data from the cache buffer */
+          memcpy(buff, block->buffer+offset, (size_t) read_length);
+
+          mysql_mutex_lock(&keycache->cache_lock);
+          DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        }
+      }
+
+      remove_reader(block);
+
+      /* Error injection for coverage testing. */
+      DBUG_EXECUTE_IF("key_cache_read_block_error",
+                      block->status|= BLOCK_ERROR;);
+
+      /* Do not link erroneous blocks into the LRU ring, but free them. */
+      if (!(block->status & BLOCK_ERROR))
+      {
+        /*
+          Link the block into the LRU ring if it's the last submitted
+          request for the block. This enables eviction for the block.
+        */
+        unreg_request(keycache, block, 1);
+      }
+      else
+      {
+        free_block(keycache, thread_var, block);
+        error= 1;
+        break;
+      }
+
+    next_block:
+      buff+= read_length;
+      filepos+= read_length+offset;
+      offset= 0;
+
+    } while ((length-= read_length));
+    if (MYSQL_KEYCACHE_READ_DONE_ENABLED())
+    {
+      MYSQL_KEYCACHE_READ_DONE((ulong) (keycache->blocks_used *
+                                        keycache->key_cache_block_size),
+                               (ulong) (keycache->blocks_unused *
+                                        keycache->key_cache_block_size));
+    }
+    goto end;
+  }
+
+no_key_cache:
+  /* Key cache is not used */
+
+  keycache->global_cache_r_requests++;
+  keycache->global_cache_read++;
+
+  if (locked_and_incremented)
+    mysql_mutex_unlock(&keycache->cache_lock);
+  if (my_pread(file, (uchar*) buff, length, filepos, MYF(MY_NABP)))
+    error= 1;
+  if (locked_and_incremented)
+    mysql_mutex_lock(&keycache->cache_lock);
+
+end:
+  if (locked_and_incremented)
+  {
+    dec_counter_for_resize_op(keycache);
+    mysql_mutex_unlock(&keycache->cache_lock);
+  }
+  DBUG_PRINT("exit", ("error: %d", error ));
+  DBUG_RETURN(error ? (uchar*) 0 : start);
+}
+
+
+/*
+  Insert a block of file data from a buffer into key cache
+
+  SYNOPSIS
+    key_cache_insert()
+    keycache            pointer to a key cache data structure
+    thread_var          pointer to thread specific variables
+    file                handler for the file to insert data from
+    filepos             position of the block of data in the file to insert
+    level               determines the weight of the data
+    buff                buffer to read data from
+    length              length of the data in the buffer
+
+  NOTES
+    This is used by MyISAM to move all blocks from a index file to the key
+    cache
+
+  RETURN VALUE
+    0 if a success, 1 - otherwise.
+*/
+
+int key_cache_insert(KEY_CACHE *keycache,
+                     st_keycache_thread_var *thread_var,
+                     File file, my_off_t filepos, int level,
+                     uchar *buff, uint length)
+{
+  int error= 0;
+  DBUG_ENTER("key_cache_insert");
+  DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
+               (uint) file,(ulong) filepos, length));
+
+  if (keycache->key_cache_inited)
+  {
+    /* Key cache is used */
+    BLOCK_LINK *block;
+    uint read_length;
+    uint offset;
+    int page_st;
+    my_bool locked_and_incremented= FALSE;
+
+    /*
+      When the keycache is once initialized, we use the cache_lock to
+      reliably distinguish the cases of normal operation, resizing, and
+      disabled cache. We always increment and decrement
+      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
+    */
+    mysql_mutex_lock(&keycache->cache_lock);
+    /*
+      We do not load index data into a disabled cache nor into an
+      ongoing resize.
+    */
+    if (!keycache->can_be_used || keycache->in_resize)
+	goto no_key_cache;
+    /* Register the pseudo I/O for the next resize. */
+    inc_counter_for_resize_op(keycache);
+    locked_and_incremented= TRUE;
+    /* Loaded data may not always be aligned to cache blocks. */
+    offset= (uint) (filepos % keycache->key_cache_block_size);
+    /* Load data in key_cache_block_size increments. */
+    do
+    {
+      /* Cache could be disabled or resizing in a later iteration. */
+      if (!keycache->can_be_used || keycache->in_resize)
+	goto no_key_cache;
+      /* Start loading at the beginning of the cache block. */
+      filepos-= offset;
+      /* Do not load beyond the end of the cache block. */
+      read_length= length;
+      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
+      DBUG_ASSERT(read_length > 0);
+
+      /* The block has been read by the caller already. */
+      keycache->global_cache_read++;
+      /* Request the cache block that matches file/pos. */
+      keycache->global_cache_r_requests++;
+      block= find_key_block(keycache, thread_var, file, filepos, level, 0,
+                            &page_st);
+      if (!block)
+      {
+        /*
+          This happens only for requests submitted during key cache
+          resize. The block is not in the cache and shall not go in.
+          Stop loading index data.
+        */
+        goto no_key_cache;
+      }
+      if (!(block->status & BLOCK_ERROR))
+      {
+        if ((page_st == PAGE_WAIT_TO_BE_READ) ||
+            ((page_st == PAGE_TO_BE_READ) &&
+             (offset || (read_length < keycache->key_cache_block_size))))
+        {
+          /*
+            Either
+
+            this is a secondary request for a block to be read into the
+            cache. The block is in eviction. It is not yet assigned to
+            the requested file block (It does not point to the right
+            hash_link). So we cannot call remove_reader() on the block.
+            And we cannot access the hash_link directly here. We need to
+            wait until the assignment is complete. read_block() executes
+            the correct wait when called with primary == FALSE.
+
+            Or
+
+            this is a primary request for a block to be read into the
+            cache and the supplied data does not fill the whole block.
+
+            This function is called on behalf of a LOAD INDEX INTO CACHE
+            statement, which is a read-only task and allows other
+            readers. It is possible that a parallel running reader tries
+            to access this block. If it needs more data than has been
+            supplied here, it would report an error. To be sure that we
+            have all data in the block that is available in the file, we
+            read the block ourselves.
+
+            Though reading again what the caller did read already is an
+            expensive operation, we need to do this for correctness.
+          */
+          read_block(keycache, thread_var, block,
+                     keycache->key_cache_block_size,
+                     read_length + offset, (page_st == PAGE_TO_BE_READ));
+          /*
+            A secondary request must now have the block assigned to the
+            requested file block. It does not hurt to check it for
+            primary requests too.
+          */
+          DBUG_ASSERT(keycache->can_be_used);
+          DBUG_ASSERT(block->hash_link->file == file);
+          DBUG_ASSERT(block->hash_link->diskpos == filepos);
+          DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        }
+        else if (page_st == PAGE_TO_BE_READ)
+        {
+          /*
+            This is a new block in the cache. If we come here, we have
+            data for the whole block.
+          */
+          DBUG_ASSERT(block->hash_link->requests);
+          DBUG_ASSERT(block->status & BLOCK_IN_USE);
+          DBUG_ASSERT((page_st == PAGE_TO_BE_READ) ||
+                      (block->status & BLOCK_READ));
+
+          mysql_mutex_unlock(&keycache->cache_lock);
+          /*
+            Here other threads may step in and register as secondary readers.
+            They will register in block->wqueue[COND_FOR_REQUESTED].
+          */
+
+          /* Copy data from buff */
+          memcpy(block->buffer+offset, buff, (size_t) read_length);
+
+          mysql_mutex_lock(&keycache->cache_lock);
+          DBUG_ASSERT(block->status & BLOCK_IN_USE);
+          DBUG_ASSERT((page_st == PAGE_TO_BE_READ) ||
+                      (block->status & BLOCK_READ));
+          /*
+            After the data is in the buffer, we can declare the block
+            valid. Now other threads do not need to register as
+            secondary readers any more. They can immediately access the
+            block.
+          */
+          block->status|= BLOCK_READ;
+          block->length= read_length+offset;
+          /*
+            Do not set block->offset here. If this block is marked
+            BLOCK_CHANGED later, we want to flush only the modified part. So
+            only a writer may set block->offset down from
+            keycache->key_cache_block_size.
+          */
+          /* Signal all pending requests. */
+          release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
+        }
+        else
+        {
+          /*
+            page_st == PAGE_READ. The block is in the buffer. All data
+            must already be present. Blocks are always read with all
+            data available on file. Assert that the block does not have
+            less contents than the preloader supplies. If the caller has
+            data beyond block->length, it means that a file write has
+            been done while this block was in cache and not extended
+            with the new data. If the condition is met, we can simply
+            ignore the block.
+          */
+          DBUG_ASSERT((page_st == PAGE_READ) &&
+                      (read_length + offset <= block->length));
+        }
+
+        /*
+          A secondary request must now have the block assigned to the
+          requested file block. It does not hurt to check it for primary
+          requests too.
+        */
+        DBUG_ASSERT(block->hash_link->file == file);
+        DBUG_ASSERT(block->hash_link->diskpos == filepos);
+        DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+      } /* end of if (!(block->status & BLOCK_ERROR)) */
+
+      remove_reader(block);
+
+      /* Error injection for coverage testing. */
+      DBUG_EXECUTE_IF("key_cache_insert_block_error",
+                      block->status|= BLOCK_ERROR; errno=EIO;);
+
+      /* Do not link erroneous blocks into the LRU ring, but free them. */
+      if (!(block->status & BLOCK_ERROR))
+      {
+        /*
+          Link the block into the LRU ring if it's the last submitted
+          request for the block. This enables eviction for the block.
+        */
+        unreg_request(keycache, block, 1);
+      }
+      else
+      {
+        free_block(keycache, thread_var, block);
+        error= 1;
+        break;
+      }
+
+      buff+= read_length;
+      filepos+= read_length+offset;
+      offset= 0;
+
+    } while ((length-= read_length));
+
+  no_key_cache:
+    if (locked_and_incremented)
+      dec_counter_for_resize_op(keycache);
+    mysql_mutex_unlock(&keycache->cache_lock);
+  }
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Write a buffer into a cached file.
+
+  SYNOPSIS
+
+    key_cache_write()
+      keycache            pointer to a key cache data structure
+      thread_var          pointer to thread specific variables
+      file                handler for the file to write data to
+      filepos             position in the file to write data to
+      level               determines the weight of the data
+      buff                buffer with the data
+      length              length of the buffer
+      dont_write          if is 0 then all dirty pages involved in writing
+                          should have been flushed from key cache
+
+  RETURN VALUE
+    0 if a success, 1 - otherwise.
+
+  NOTES.
+    The function copies the data of size length from buff into buffers
+    for key cache blocks that are  assigned to contain the portion of
+    the file starting with position filepos.
+    It ensures that this data is flushed to the file if dont_write is FALSE.
+    Filepos must be a multiple of 'block_length', but it doesn't
+    have to be a multiple of key_cache_block_size;
+
+    dont_write is always TRUE in the server (info->lock_type is never F_UNLCK).
+*/
+
+int key_cache_write(KEY_CACHE *keycache,
+                    st_keycache_thread_var *thread_var,
+                    File file, my_off_t filepos, int level,
+                    uchar *buff, uint length,
+                    uint block_length  MY_ATTRIBUTE((unused)),
+                    int dont_write)
+{
+  my_bool locked_and_incremented= FALSE;
+  int error=0;
+  DBUG_ENTER("key_cache_write");
+  DBUG_PRINT("enter",
+             ("fd: %u  pos: %lu  length: %u  block_length: %u"
+              "  key_block_length: %u",
+              (uint) file, (ulong) filepos, length, block_length,
+              keycache ? keycache->key_cache_block_size : 0));
+
+  if (!dont_write)
+  {
+    /* purecov: begin inspected */
+    /* Not used in the server. */
+    /* Force writing from buff into disk. */
+    keycache->global_cache_w_requests++;
+    keycache->global_cache_write++;
+    if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
+      DBUG_RETURN(1);
+    /* purecov: end */
+  }
+
+  if (keycache->key_cache_inited)
+  {
+    /* Key cache is used */
+    BLOCK_LINK *block;
+    uint read_length;
+    uint offset;
+    int page_st;
+
+    if (MYSQL_KEYCACHE_WRITE_START_ENABLED())
+    {
+      MYSQL_KEYCACHE_WRITE_START(my_filename(file), length,
+                                 (ulong) (keycache->blocks_used *
+                                          keycache->key_cache_block_size),
+                                 (ulong) (keycache->blocks_unused *
+                                          keycache->key_cache_block_size));
+    }
+
+    /*
+      When the key cache is once initialized, we use the cache_lock to
+      reliably distinguish the cases of normal operation, resizing, and
+      disabled cache. We always increment and decrement
+      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
+    */
+    mysql_mutex_lock(&keycache->cache_lock);
+    /*
+      Cache resizing has two phases: Flushing and re-initializing. In
+      the flush phase write requests can modify dirty blocks that are
+      not yet in flush. Otherwise they are allowed to bypass the cache.
+      find_key_block() returns NULL in both cases (clean blocks and
+      non-cached blocks).
+
+      After the flush phase new I/O requests must wait until the
+      re-initialization is done. The re-initialization can be done only
+      if no I/O request is in progress. The reason is that
+      key_cache_block_size can change. With enabled cache I/O is done in
+      chunks of key_cache_block_size. Every chunk tries to use a cache
+      block first. If the block size changes in the middle, a block
+      could be missed and data could be written below a cached block.
+    */
+    while (keycache->in_resize && !keycache->resize_in_flush)
+      wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
+                    thread_var);
+    /* Register the I/O for the next resize. */
+    inc_counter_for_resize_op(keycache);
+    locked_and_incremented= TRUE;
+    /* Requested data may not always be aligned to cache blocks. */
+    offset= (uint) (filepos % keycache->key_cache_block_size);
+    /* Write data in key_cache_block_size increments. */
+    do
+    {
+      /* Cache could be disabled in a later iteration. */
+      if (!keycache->can_be_used)
+	goto no_key_cache;
+
+      MYSQL_KEYCACHE_WRITE_BLOCK(keycache->key_cache_block_size);
+      /* Start writing at the beginning of the cache block. */
+      filepos-= offset;
+      /* Do not write beyond the end of the cache block. */
+      read_length= length;
+      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
+      DBUG_ASSERT(read_length > 0);
+
+      /* Request the cache block that matches file/pos. */
+      keycache->global_cache_w_requests++;
+      block= find_key_block(keycache, thread_var, file, filepos, level, 1,
+                            &page_st);
+      if (!block)
+      {
+        /*
+          This happens only for requests submitted during key cache
+          resize. The block is not in the cache and shall not go in.
+          Write directly to file.
+        */
+        if (dont_write)
+        {
+          /* Used in the server. */
+          keycache->global_cache_write++;
+          mysql_mutex_unlock(&keycache->cache_lock);
+          if (my_pwrite(file, (uchar*) buff, read_length, filepos + offset,
+                        MYF(MY_NABP | MY_WAIT_IF_FULL)))
+            error=1;
+          mysql_mutex_lock(&keycache->cache_lock);
+        }
+        goto next_block;
+      }
+      /*
+        Prevent block from flushing and from being selected for to be
+        freed. This must be set when we release the cache_lock.
+        However, we must not set the status of the block before it is
+        assigned to this file/pos.
+      */
+      if (page_st != PAGE_WAIT_TO_BE_READ)
+        block->status|= BLOCK_FOR_UPDATE;
+      /*
+        We must read the file block first if it is not yet in the cache
+        and we do not replace all of its contents.
+
+        In cases where the cache block is big enough to contain (parts
+        of) index blocks of different indexes, our request can be
+        secondary (PAGE_WAIT_TO_BE_READ). In this case another thread is
+        reading the file block. If the read completes after us, it
+        overwrites our new contents with the old contents. So we have to
+        wait for the other thread to complete the read of this block.
+        read_block() takes care for the wait.
+      */
+      if (!(block->status & BLOCK_ERROR) &&
+          ((page_st == PAGE_TO_BE_READ &&
+            (offset || read_length < keycache->key_cache_block_size)) ||
+           (page_st == PAGE_WAIT_TO_BE_READ)))
+      {
+        read_block(keycache, thread_var, block,
+                   offset + read_length >= keycache->key_cache_block_size?
+                   offset : keycache->key_cache_block_size,
+                   offset, (page_st == PAGE_TO_BE_READ));
+        DBUG_ASSERT(keycache->can_be_used);
+        DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        /*
+          Prevent block from flushing and from being selected for to be
+          freed. This must be set when we release the cache_lock.
+          Here we set it in case we could not set it above.
+        */
+        block->status|= BLOCK_FOR_UPDATE;
+      }
+      /*
+        The block should always be assigned to the requested file block
+        here. It need not be BLOCK_READ when overwriting the whole block.
+      */
+      DBUG_ASSERT(block->hash_link->file == file);
+      DBUG_ASSERT(block->hash_link->diskpos == filepos);
+      DBUG_ASSERT(block->status & BLOCK_IN_USE);
+      DBUG_ASSERT((page_st == PAGE_TO_BE_READ) || (block->status & BLOCK_READ));
+      /*
+        The block to be written must not be marked BLOCK_REASSIGNED.
+        Otherwise it could be freed in dirty state or reused without
+        another flush during eviction. It must also not be in flush.
+        Otherwise the old contens may have been flushed already and
+        the flusher could clear BLOCK_CHANGED without flushing the
+        new changes again.
+      */
+      DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
+
+      while (block->status & BLOCK_IN_FLUSHWRITE)
+      {
+        /*
+          Another thread is flushing the block. It was dirty already.
+          Wait until the block is flushed to file. Otherwise we could
+          modify the buffer contents just while it is written to file.
+          An unpredictable file block contents would be the result.
+          While we wait, several things can happen to the block,
+          including another flush. But the block cannot be reassigned to
+          another hash_link until we release our request on it.
+        */
+        wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
+                      thread_var);
+        DBUG_ASSERT(keycache->can_be_used);
+        DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
+        /* Still must not be marked for free. */
+        DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
+        DBUG_ASSERT(block->hash_link && (block->hash_link->block == block));
+      }
+
+      /*
+        We could perhaps release the cache_lock during access of the
+        data like in the other functions. Locks outside of the key cache
+        assure that readers and a writer do not access the same range of
+        data. Parallel accesses should happen only if the cache block
+        contains multiple index block(fragment)s. So different parts of
+        the buffer would be read/written. An attempt to flush during
+        memcpy() is prevented with BLOCK_FOR_UPDATE.
+      */
+      if (!(block->status & BLOCK_ERROR))
+      {
+        mysql_mutex_unlock(&keycache->cache_lock);
+        memcpy(block->buffer+offset, buff, (size_t) read_length);
+
+        mysql_mutex_lock(&keycache->cache_lock);
+      }
+
+      if (!dont_write)
+      {
+        /* Not used in the server. buff has been written to disk at start. */
+        if ((block->status & BLOCK_CHANGED) &&
+            (!offset && read_length >= keycache->key_cache_block_size))
+             link_to_file_list(keycache, block, block->hash_link->file, 1);
+      }
+      else if (! (block->status & BLOCK_CHANGED))
+        link_to_changed_list(keycache, block);
+      block->status|=BLOCK_READ;
+      /*
+        Allow block to be selected for to be freed. Since it is marked
+        BLOCK_CHANGED too, it won't be selected for to be freed without
+        a flush.
+      */
+      block->status&= ~BLOCK_FOR_UPDATE;
+      set_if_smaller(block->offset, offset);
+      set_if_bigger(block->length, read_length+offset);
+
+      /* Threads may be waiting for the changes to be complete. */
+      release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
+
+      /*
+        If only a part of the cache block is to be replaced, and the
+        rest has been read from file, then the cache lock has been
+        released for I/O and it could be possible that another thread
+        wants to evict or free the block and waits for it to be
+        released. So we must not just decrement hash_link->requests, but
+        also wake a waiting thread.
+      */
+      remove_reader(block);
+
+      /* Error injection for coverage testing. */
+      DBUG_EXECUTE_IF("key_cache_write_block_error",
+                      block->status|= BLOCK_ERROR;);
+
+      /* Do not link erroneous blocks into the LRU ring, but free them. */
+      if (!(block->status & BLOCK_ERROR))
+      {
+        /*
+          Link the block into the LRU ring if it's the last submitted
+          request for the block. This enables eviction for the block.
+        */
+        unreg_request(keycache, block, 1);
+      }
+      else
+      {
+        /* Pretend a "clean" block to avoid complications. */
+        block->status&= ~(BLOCK_CHANGED);
+        free_block(keycache, thread_var, block);
+        error= 1;
+        break;
+      }
+
+    next_block:
+      buff+= read_length;
+      filepos+= read_length+offset;
+      offset= 0;
+
+    } while ((length-= read_length));
+    goto end;
+  }
+
+no_key_cache:
+  /* Key cache is not used */
+  if (dont_write)
+  {
+    /* Used in the server. */
+    keycache->global_cache_w_requests++;
+    keycache->global_cache_write++;
+    if (locked_and_incremented)
+      mysql_mutex_unlock(&keycache->cache_lock);
+    if (my_pwrite(file, (uchar*) buff, length, filepos,
+		  MYF(MY_NABP | MY_WAIT_IF_FULL)))
+      error=1;
+    if (locked_and_incremented)
+      mysql_mutex_lock(&keycache->cache_lock);
+  }
+
+end:
+  if (locked_and_incremented)
+  {
+    dec_counter_for_resize_op(keycache);
+    mysql_mutex_unlock(&keycache->cache_lock);
+  }
+  
+  if (MYSQL_KEYCACHE_WRITE_DONE_ENABLED())
+  {
+    MYSQL_KEYCACHE_WRITE_DONE((ulong) (keycache->blocks_used *
+                                       keycache->key_cache_block_size),
+                              (ulong) (keycache->blocks_unused *
+                                       keycache->key_cache_block_size));
+  }
+
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Free block.
+
+  SYNOPSIS
+    free_block()
+      keycache          Pointer to a key cache data structure
+      thread_var        Pointer to thread specific variables
+      block             Pointer to the block to free
+
+  DESCRIPTION
+    Remove reference to block from hash table.
+    Remove block from the chain of clean blocks.
+    Add block to the free list.
+
+  NOTE
+    Block must not be free (status == 0).
+    Block must not be in free_block_list.
+    Block must not be in the LRU ring.
+    Block must not be in eviction (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH).
+    Block must not be in free (BLOCK_REASSIGNED).
+    Block must not be in flush (BLOCK_IN_FLUSH).
+    Block must not be dirty (BLOCK_CHANGED).
+    Block must not be in changed_blocks (dirty) hash.
+    Block must be in file_blocks (clean) hash.
+    Block must refer to a hash_link.
+    Block must have a request registered on it.
+*/
+
+static void free_block(KEY_CACHE *keycache,
+                       st_keycache_thread_var *thread_var,
+                       BLOCK_LINK *block)
+{
+  /*
+    Assert that the block is not free already. And that it is in a clean
+    state. Note that the block might just be assigned to a hash_link and
+    not yet read (BLOCK_READ may not be set here). In this case a reader
+    is registered in the hash_link and free_block() will wait for it
+    below.
+  */
+  DBUG_ASSERT((block->status & BLOCK_IN_USE) &&
+              !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                                 BLOCK_REASSIGNED | BLOCK_IN_FLUSH |
+                                 BLOCK_CHANGED | BLOCK_FOR_UPDATE)));
+  /* Assert that the block is in a file_blocks chain. */
+  DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+  /* Assert that the block is not in the LRU ring. */
+  DBUG_ASSERT(!block->next_used && !block->prev_used);
+  /*
+    IMHO the below condition (if()) makes no sense. I can't see how it
+    could be possible that free_block() is entered with a NULL hash_link
+    pointer. The only place where it can become NULL is in free_block()
+    (or before its first use ever, but for those blocks free_block() is
+    not called). I don't remove the conditional as it cannot harm, but
+    place an DBUG_ASSERT to confirm my hypothesis. Eventually the
+    condition (if()) can be removed.
+  */
+  DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+  if (block->hash_link)
+  {
+    /*
+      While waiting for readers to finish, new readers might request the
+      block. But since we set block->status|= BLOCK_REASSIGNED, they
+      will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
+      later.
+    */
+    block->status|= BLOCK_REASSIGNED;
+    wait_for_readers(keycache, block, thread_var);
+    /*
+      The block must not have been freed by another thread. Repeat some
+      checks. An additional requirement is that it must be read now
+      (BLOCK_READ).
+    */
+    DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
+    DBUG_ASSERT((block->status & (BLOCK_READ | BLOCK_IN_USE |
+                                  BLOCK_REASSIGNED)) &&
+                !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                                   BLOCK_IN_FLUSH | BLOCK_CHANGED |
+                                   BLOCK_FOR_UPDATE)));
+    DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
+    DBUG_ASSERT(!block->prev_used);
+    /*
+      Unset BLOCK_REASSIGNED again. If we hand the block to an evicting
+      thread (through unreg_request() below), other threads must not see
+      this flag. They could become confused.
+    */
+    block->status&= ~BLOCK_REASSIGNED;
+    /*
+      Do not release the hash_link until the block is off all lists.
+      At least not if we hand it over for eviction in unreg_request().
+    */
+  }
+
+  /*
+    Unregister the block request and link the block into the LRU ring.
+    This enables eviction for the block. If the LRU ring was empty and
+    threads are waiting for a block, then the block wil be handed over
+    for eviction immediately. Otherwise we will unlink it from the LRU
+    ring again, without releasing the lock in between. So decrementing
+    the request counter and updating statistics are the only relevant
+    operation in this case. Assert that there are no other requests
+    registered.
+  */
+  DBUG_ASSERT(block->requests == 1);
+  unreg_request(keycache, block, 0);
+  /*
+    Note that even without releasing the cache lock it is possible that
+    the block is immediately selected for eviction by link_block() and
+    thus not added to the LRU ring. In this case we must not touch the
+    block any more.
+  */
+  if (block->status & BLOCK_IN_EVICTION)
+    return;
+
+  /* Error blocks are not put into the LRU ring. */
+  if (!(block->status & BLOCK_ERROR))
+  {
+    /* Here the block must be in the LRU ring. Unlink it again. */
+    DBUG_ASSERT(block->next_used && block->prev_used &&
+                *block->prev_used == block);
+    unlink_block(keycache, block);
+  }
+  if (block->temperature == BLOCK_WARM)
+    keycache->warm_blocks--;
+  block->temperature= BLOCK_COLD;
+
+  /* Remove from file_blocks hash. */
+  unlink_changed(block);
+
+  /* Remove reference to block from hash table. */
+  unlink_hash(keycache, block->hash_link);
+  block->hash_link= NULL;
+
+  block->status= 0;
+  block->length= 0;
+  block->offset= keycache->key_cache_block_size;
+
+  /* Enforced by unlink_changed(), but just to be sure. */
+  DBUG_ASSERT(!block->next_changed && !block->prev_changed);
+  /* Enforced by unlink_block(): not in LRU ring nor in free_block_list. */
+  DBUG_ASSERT(!block->next_used && !block->prev_used);
+  /* Insert the free block in the free list. */
+  block->next_used= keycache->free_block_list;
+  keycache->free_block_list= block;
+  /* Keep track of the number of currently unused blocks. */
+  keycache->blocks_unused++;
+
+  /* All pending requests for this page must be resubmitted. */
+  release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
+}
+
+
+static int cmp_sec_link(BLOCK_LINK **a, BLOCK_LINK **b)
+{
+  return (((*a)->hash_link->diskpos < (*b)->hash_link->diskpos) ? -1 :
+      ((*a)->hash_link->diskpos > (*b)->hash_link->diskpos) ? 1 : 0);
+}
+
+
+/*
+  Flush a portion of changed blocks to disk,
+  free used blocks if requested
+*/
+
+static int flush_cached_blocks(KEY_CACHE *keycache,
+                               st_keycache_thread_var *thread_var,
+                               File file, BLOCK_LINK **cache,
+                               BLOCK_LINK **end,
+                               enum flush_type type)
+{
+  int error;
+  int last_errno= 0;
+  uint count= (uint) (end-cache);
+
+  /* Don't lock the cache during the flush */
+  mysql_mutex_unlock(&keycache->cache_lock);
+  /*
+     As all blocks referred in 'cache' are marked by BLOCK_IN_FLUSH
+     we are guarunteed no thread will change them
+  */
+  my_qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
+
+  mysql_mutex_lock(&keycache->cache_lock);
+  /*
+    Note: Do not break the loop. We have registered a request on every
+    block in 'cache'. These must be unregistered by free_block() or
+    unreg_request().
+  */
+  for ( ; cache != end ; cache++)
+  {
+    BLOCK_LINK *block= *cache;
+
+    /*
+      If the block contents is going to be changed, we abandon the flush
+      for this block. flush_key_blocks_int() will restart its search and
+      handle the block properly.
+    */
+    if (!(block->status & BLOCK_FOR_UPDATE))
+    {
+      /* Blocks coming here must have a certain status. */
+      DBUG_ASSERT(block->hash_link);
+      DBUG_ASSERT(block->hash_link->block == block);
+      DBUG_ASSERT(block->hash_link->file == file);
+      DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
+                  (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
+      block->status|= BLOCK_IN_FLUSHWRITE;
+      mysql_mutex_unlock(&keycache->cache_lock);
+      error= (int)my_pwrite(file, block->buffer+block->offset,
+                            block->length - block->offset,
+                            block->hash_link->diskpos+ block->offset,
+                            MYF(MY_NABP | MY_WAIT_IF_FULL));
+      mysql_mutex_lock(&keycache->cache_lock);
+      keycache->global_cache_write++;
+      if (error)
+      {
+        block->status|= BLOCK_ERROR;
+        if (!last_errno)
+          last_errno= errno ? errno : -1;
+      }
+      block->status&= ~BLOCK_IN_FLUSHWRITE;
+      /* Block must not have changed status except BLOCK_FOR_UPDATE. */
+      DBUG_ASSERT(block->hash_link);
+      DBUG_ASSERT(block->hash_link->block == block);
+      DBUG_ASSERT(block->hash_link->file == file);
+      DBUG_ASSERT((block->status & ~(BLOCK_FOR_UPDATE | BLOCK_IN_EVICTION)) ==
+                  (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
+      /*
+        Set correct status and link in right queue for free or later use.
+        free_block() must not see BLOCK_CHANGED and it may need to wait
+        for readers of the block. These should not see the block in the
+        wrong hash. If not freeing the block, we need to have it in the
+        right queue anyway.
+      */
+      link_to_file_list(keycache, block, file, 1);
+    }
+    block->status&= ~BLOCK_IN_FLUSH;
+    /*
+      Let to proceed for possible waiting requests to write to the block page.
+      It might happen only during an operation to resize the key cache.
+    */
+    release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
+    /* type will never be FLUSH_IGNORE_CHANGED here */
+    if (!(type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) &&
+        !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                           BLOCK_FOR_UPDATE)))
+    {
+      /*
+        Note that a request has been registered against the block in
+        flush_key_blocks_int().
+      */
+      free_block(keycache, thread_var, block);
+    }
+    else
+    {
+      /*
+        Link the block into the LRU ring if it's the last submitted
+        request for the block. This enables eviction for the block.
+        Note that a request has been registered against the block in
+        flush_key_blocks_int().
+      */
+      unreg_request(keycache, block, 1);
+    }
+
+  } /* end of for ( ; cache != end ; cache++) */
+  return last_errno;
+}
+
+
+/*
+  Flush all key blocks for a file to disk, but don't do any mutex locks.
+
+  SYNOPSIS
+    flush_key_blocks_int()
+      keycache            pointer to a key cache data structure
+      thread_var          pointer to thread specific variables
+      file                handler for the file to flush to
+      flush_type          type of the flush
+
+  NOTES
+    This function doesn't do any mutex locks because it needs to be called both
+    from flush_key_blocks and flush_all_key_blocks (the later one does the
+    mutex lock in the resize_key_cache() function).
+
+    We do only care about changed blocks that exist when the function is
+    entered. We do not guarantee that all changed blocks of the file are
+    flushed if more blocks change while this function is running.
+
+  RETURN
+    0   ok
+    1  error
+*/
+
+static int flush_key_blocks_int(KEY_CACHE *keycache,
+                                st_keycache_thread_var *thread_var,
+				File file, enum flush_type type)
+{
+  BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
+  int last_errno= 0;
+  int last_errcnt= 0;
+  DBUG_ENTER("flush_key_blocks_int");
+  DBUG_PRINT("enter",("file: %d  blocks_used: %lu  blocks_changed: %lu",
+              file, keycache->blocks_used, keycache->blocks_changed));
+
+  cache= cache_buff;
+  if (keycache->disk_blocks > 0)
+  {
+    /* Key cache exists and flush is not disabled */
+    int error= 0;
+    uint count= FLUSH_CACHE;
+    BLOCK_LINK **pos,**end;
+    BLOCK_LINK *first_in_switch= NULL;
+    BLOCK_LINK *last_in_flush;
+    BLOCK_LINK *last_for_update;
+    BLOCK_LINK *block, *next;
+#ifndef DBUG_OFF
+    uint cnt=0;
+#endif
+
+    if (type != FLUSH_IGNORE_CHANGED)
+    {
+      /*
+         Count how many key blocks we have to cache to be able
+         to flush all dirty pages with minimum seek moves
+      */
+      count= 0;
+      for (block= keycache->changed_blocks[FILE_HASH(file)] ;
+           block ;
+           block= block->next_changed)
+      {
+        if ((block->hash_link->file == file) &&
+            !(block->status & BLOCK_IN_FLUSH))
+        {
+          count++;
+          DBUG_ASSERT(count<= keycache->blocks_used);
+        }
+      }
+      /*
+        Allocate a new buffer only if its bigger than the one we have.
+        Assure that we always have some entries for the case that new
+        changed blocks appear while we need to wait for something.
+      */
+      if ((count > FLUSH_CACHE) &&
+          !(cache= (BLOCK_LINK**) my_malloc(key_memory_KEY_CACHE,
+                                            sizeof(BLOCK_LINK*)*count,
+                                            MYF(0))))
+        cache= cache_buff;
+      /*
+        After a restart there could be more changed blocks than now.
+        So we should not let count become smaller than the fixed buffer.
+      */
+      if (cache == cache_buff)
+        count= FLUSH_CACHE;
+    }
+
+    /* Retrieve the blocks and write them to a buffer to be flushed */
+restart:
+    last_in_flush= NULL;
+    last_for_update= NULL;
+    end= (pos= cache)+count;
+    for (block= keycache->changed_blocks[FILE_HASH(file)] ;
+         block ;
+         block= next)
+    {
+#ifndef DBUG_OFF
+      cnt++;
+      DBUG_ASSERT(cnt <= keycache->blocks_used);
+#endif
+      next= block->next_changed;
+      if (block->hash_link->file == file)
+      {
+        if (!(block->status & (BLOCK_IN_FLUSH | BLOCK_FOR_UPDATE)))
+        {
+          /*
+            Note: The special handling of BLOCK_IN_SWITCH is obsolete
+            since we set BLOCK_IN_FLUSH if the eviction includes a
+            flush. It can be removed in a later version.
+          */
+          if (!(block->status & BLOCK_IN_SWITCH))
+          {
+            /*
+              We care only for the blocks for which flushing was not
+              initiated by another thread and which are not in eviction.
+              Registering a request on the block unlinks it from the LRU
+              ring and protects against eviction.
+            */
+            reg_requests(keycache, block, 1);
+            if (type != FLUSH_IGNORE_CHANGED)
+            {
+              /* It's not a temporary file */
+              if (pos == end)
+              {
+                /*
+                  This should happen relatively seldom. Remove the
+                  request because we won't do anything with the block
+                  but restart and pick it again in the next iteration.
+                */
+                unreg_request(keycache, block, 0);
+                /*
+                  This happens only if there is not enough
+                  memory for the big block
+                */
+                if ((error= flush_cached_blocks(keycache, thread_var, file,
+                                                cache, end, type)))
+                {
+                  /* Do not loop infinitely trying to flush in vain. */
+                  if ((last_errno == error) && (++last_errcnt > 5))
+                    goto err;
+                  last_errno= error;
+                }
+                /*
+                  Restart the scan as some other thread might have changed
+                  the changed blocks chain: the blocks that were in switch
+                  state before the flush started have to be excluded
+                */
+                goto restart;
+              }
+              /*
+                Mark the block with BLOCK_IN_FLUSH in order not to let
+                other threads to use it for new pages and interfere with
+                our sequence of flushing dirty file pages. We must not
+                set this flag before actually putting the block on the
+                write burst array called 'cache'.
+              */
+              block->status|= BLOCK_IN_FLUSH;
+              /* Add block to the array for a write burst. */
+              *pos++= block;
+            }
+            else
+            {
+              /* It's a temporary file */
+              DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
+              /*
+                free_block() must not be called with BLOCK_CHANGED. Note
+                that we must not change the BLOCK_CHANGED flag outside of
+                link_to_file_list() so that it is always in the correct
+                queue and the *blocks_changed counters are correct.
+              */
+              link_to_file_list(keycache, block, file, 1);
+              if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)))
+              {
+                /* A request has been registered against the block above. */
+                free_block(keycache, thread_var, block);
+              }
+              else
+              {
+                /*
+                  Link the block into the LRU ring if it's the last
+                  submitted request for the block. This enables eviction
+                  for the block. A request has been registered against
+                  the block above.
+                */
+                unreg_request(keycache, block, 1);
+              }
+            }
+          }
+          else
+          {
+            /*
+              Link the block into a list of blocks 'in switch'.
+
+              WARNING: Here we introduce a place where a changed block
+              is not in the changed_blocks hash! This is acceptable for
+              a BLOCK_IN_SWITCH. Never try this for another situation.
+              Other parts of the key cache code rely on changed blocks
+              being in the changed_blocks hash.
+            */
+            unlink_changed(block);
+            link_changed(block, &first_in_switch);
+          }
+        }
+        else if (type != FLUSH_KEEP)
+        {
+          /*
+            During the normal flush at end of statement (FLUSH_KEEP) we
+            do not need to ensure that blocks in flush or update by
+            other threads are flushed. They will be flushed by them
+            later. In all other cases we must assure that we do not have
+            any changed block of this file in the cache when this
+            function returns.
+          */
+          if (block->status & BLOCK_IN_FLUSH)
+          {
+            /* Remember the last block found to be in flush. */
+            last_in_flush= block;
+          }
+          else
+          {
+            /* Remember the last block found to be selected for update. */
+            last_for_update= block;
+          }
+        }
+      }
+    }
+    if (pos != cache)
+    {
+      if ((error=
+           flush_cached_blocks(keycache, thread_var, file, cache, pos, type)))
+      {
+        /* Do not loop inifnitely trying to flush in vain. */
+        if ((last_errno == error) && (++last_errcnt > 5))
+          goto err;
+        last_errno= error;
+      }
+      /*
+        Do not restart here during the normal flush at end of statement
+        (FLUSH_KEEP). We have now flushed at least all blocks that were
+        changed when entering this function. In all other cases we must
+        assure that we do not have any changed block of this file in the
+        cache when this function returns.
+      */
+      if (type != FLUSH_KEEP)
+        goto restart;
+    }
+    if (last_in_flush)
+    {
+      /*
+        There are no blocks to be flushed by this thread, but blocks in
+        flush by other threads. Wait until one of the blocks is flushed.
+        Re-check the condition for last_in_flush. We may have unlocked
+        the cache_lock in flush_cached_blocks(). The state of the block
+        could have changed.
+      */
+      if (last_in_flush->status & BLOCK_IN_FLUSH)
+        wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED],
+                      &keycache->cache_lock, thread_var);
+      /* Be sure not to lose a block. They may be flushed in random order. */
+      goto restart;
+    }
+    if (last_for_update)
+    {
+      /*
+        There are no blocks to be flushed by this thread, but blocks for
+        update by other threads. Wait until one of the blocks is updated.
+        Re-check the condition for last_for_update. We may have unlocked
+        the cache_lock in flush_cached_blocks(). The state of the block
+        could have changed.
+      */
+      if (last_for_update->status & BLOCK_FOR_UPDATE)
+        wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
+                      &keycache->cache_lock, thread_var);
+      /* The block is now changed. Flush it. */
+      goto restart;
+    }
+
+    /*
+      Wait until the list of blocks in switch is empty. The threads that
+      are switching these blocks will relink them to clean file chains
+      while we wait and thus empty the 'first_in_switch' chain.
+    */
+    while (first_in_switch)
+    {
+#ifndef DBUG_OFF
+      cnt= 0;
+#endif
+      wait_on_queue(&first_in_switch->wqueue[COND_FOR_SAVED],
+                    &keycache->cache_lock, thread_var);
+#ifndef DBUG_OFF
+      cnt++;
+      DBUG_ASSERT(cnt <= keycache->blocks_used);
+#endif
+      /*
+        Do not restart here. We have flushed all blocks that were
+        changed when entering this function and were not marked for
+        eviction. Other threads have now flushed all remaining blocks in
+        the course of their eviction.
+      */
+    }
+
+    if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
+    {
+      BLOCK_LINK *last_for_update= NULL;
+      BLOCK_LINK *last_in_switch= NULL;
+      uint total_found= 0;
+      uint found;
+
+      /*
+        Finally free all clean blocks for this file.
+        During resize this may be run by two threads in parallel.
+      */
+      do
+      {
+        found= 0;
+        for (block= keycache->file_blocks[FILE_HASH(file)] ;
+             block ;
+             block= next)
+        {
+          /* Remember the next block. After freeing we cannot get at it. */
+          next= block->next_changed;
+
+          /* Changed blocks cannot appear in the file_blocks hash. */
+          DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
+          if (block->hash_link->file == file)
+          {
+            /* We must skip blocks that will be changed. */
+            if (block->status & BLOCK_FOR_UPDATE)
+            {
+              last_for_update= block;
+              continue;
+            }
+
+            /*
+              We must not free blocks in eviction (BLOCK_IN_EVICTION |
+              BLOCK_IN_SWITCH) or blocks intended to be freed
+              (BLOCK_REASSIGNED).
+            */
+            if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
+                                   BLOCK_REASSIGNED)))
+            {
+              struct st_hash_link *next_hash_link= NULL;
+              my_off_t next_diskpos= 0;
+              File next_file= 0;
+              uint next_status= 0;
+              uint hash_requests= 0;
+
+              total_found++;
+              found++;
+              DBUG_ASSERT(found <= keycache->blocks_used);
+
+              /*
+                Register a request. This unlinks the block from the LRU
+                ring and protects it against eviction. This is required
+                by free_block().
+              */
+              reg_requests(keycache, block, 1);
+
+              /*
+                free_block() may need to wait for readers of the block.
+                This is the moment where the other thread can move the
+                'next' block from the chain. free_block() needs to wait
+                if there are requests for the block pending.
+              */
+              if (next && (hash_requests= block->hash_link->requests))
+              {
+                /* Copy values from the 'next' block and its hash_link. */
+                next_status=    next->status;
+                next_hash_link= next->hash_link;
+                next_diskpos=   next_hash_link->diskpos;
+                next_file=      next_hash_link->file;
+                DBUG_ASSERT(next == next_hash_link->block);
+              }
+
+              free_block(keycache, thread_var, block);
+              /*
+                If we had to wait and the state of the 'next' block
+                changed, break the inner loop. 'next' may no longer be
+                part of the current chain.
+
+                We do not want to break the loop after every free_block(),
+                not even only after waits. The chain might be quite long
+                and contain blocks for many files. Traversing it again and
+                again to find more blocks for this file could become quite
+                inefficient.
+              */
+              if (next && hash_requests &&
+                  ((next_status    != next->status) ||
+                   (next_hash_link != next->hash_link) ||
+                   (next_file      != next_hash_link->file) ||
+                   (next_diskpos   != next_hash_link->diskpos) ||
+                   (next           != next_hash_link->block)))
+                break;
+            }
+            else
+            {
+              last_in_switch= block;
+            }
+          }
+        } /* end for block in file_blocks */
+      } while (found);
+
+      /*
+        If any clean block has been found, we may have waited for it to
+        become free. In this case it could be possible that another clean
+        block became dirty. This is possible if the write request existed
+        before the flush started (BLOCK_FOR_UPDATE). Re-check the hashes.
+      */
+      if (total_found)
+        goto restart;
+
+      /*
+        To avoid an infinite loop, wait until one of the blocks marked
+        for update is updated.
+      */
+      if (last_for_update)
+      {
+        /* We did not wait. Block must not have changed status. */
+        DBUG_ASSERT(last_for_update->status & BLOCK_FOR_UPDATE);
+        wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
+                      &keycache->cache_lock, thread_var);
+        goto restart;
+      }
+
+      /*
+        To avoid an infinite loop wait until one of the blocks marked
+        for eviction is switched.
+      */
+      if (last_in_switch)
+      {
+        /* We did not wait. Block must not have changed status. */
+        DBUG_ASSERT(last_in_switch->status & (BLOCK_IN_EVICTION |
+                                              BLOCK_IN_SWITCH |
+                                              BLOCK_REASSIGNED));
+        wait_on_queue(&last_in_switch->wqueue[COND_FOR_SAVED],
+                      &keycache->cache_lock, thread_var);
+        goto restart;
+      }
+
+    } /* if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) */
+
+  } /* if (keycache->disk_blocks > 0 */
+
+err:
+  if (cache != cache_buff)
+    my_free(cache);
+  if (last_errno)
+    errno=last_errno;                /* Return first error */
+  DBUG_RETURN(last_errno != 0);
+}
+
+
+/*
+  Flush all blocks for a file to disk
+
+  SYNOPSIS
+
+    flush_key_blocks()
+      keycache            pointer to a key cache data structure
+      thread_var          pointer to thread specific variables
+      file                handler for the file to flush to
+      flush_type          type of the flush
+
+  RETURN
+    0   ok
+    1  error
+*/
+
+int flush_key_blocks(KEY_CACHE *keycache,
+                     st_keycache_thread_var *thread_var,
+                     File file, enum flush_type type)
+{
+  int res= 0;
+  DBUG_ENTER("flush_key_blocks");
+  DBUG_PRINT("enter", ("keycache: 0x%lx", (long) keycache));
+
+  if (!keycache->key_cache_inited)
+    DBUG_RETURN(0);
+
+  mysql_mutex_lock(&keycache->cache_lock);
+  /* While waiting for lock, keycache could have been ended. */
+  if (keycache->disk_blocks > 0)
+  {
+    inc_counter_for_resize_op(keycache);
+    res= flush_key_blocks_int(keycache, thread_var, file, type);
+    dec_counter_for_resize_op(keycache);
+  }
+  mysql_mutex_unlock(&keycache->cache_lock);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Flush all blocks in the key cache to disk.
+
+  SYNOPSIS
+    flush_all_key_blocks()
+      keycache                  pointer to key cache root structure
+      thread_var                pointer to thread specific variables
+
+  DESCRIPTION
+
+    Flushing of the whole key cache is done in two phases.
+
+    1. Flush all changed blocks, waiting for them if necessary. Loop
+    until there is no changed block left in the cache.
+
+    2. Free all clean blocks. Normally this means free all blocks. The
+    changed blocks were flushed in phase 1 and became clean. However we
+    may need to wait for blocks that are read by other threads. While we
+    wait, a clean block could become changed if that operation started
+    before the resize operation started. To be safe we must restart at
+    phase 1.
+
+    When we can run through the changed_blocks and file_blocks hashes
+    without finding a block any more, then we are done.
+
+    Note that we hold keycache->cache_lock all the time unless we need
+    to wait for something.
+
+  RETURN
+    0           OK
+    != 0        Error
+*/
+
+static int flush_all_key_blocks(KEY_CACHE *keycache,
+                                st_keycache_thread_var *thread_var)
+{
+  BLOCK_LINK    *block;
+  uint          total_found;
+  uint          found;
+  uint          idx;
+  DBUG_ENTER("flush_all_key_blocks");
+
+  do
+  {
+    mysql_mutex_assert_owner(&keycache->cache_lock);
+    total_found= 0;
+
+    /*
+      Phase1: Flush all changed blocks, waiting for them if necessary.
+      Loop until there is no changed block left in the cache.
+    */
+    do
+    {
+      found= 0;
+      /* Step over the whole changed_blocks hash array. */
+      for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++)
+      {
+        /*
+          If an array element is non-empty, use the first block from its
+          chain to find a file for flush. All changed blocks for this
+          file are flushed. So the same block will not appear at this
+          place again with the next iteration. New writes for blocks are
+          not accepted during the flush. If multiple files share the
+          same hash bucket, one of them will be flushed per iteration
+          of the outer loop of phase 1.
+        */
+        if ((block= keycache->changed_blocks[idx]))
+        {
+          found++;
+          /*
+            Flush dirty blocks but do not free them yet. They can be used
+            for reading until all other blocks are flushed too.
+          */
+          if (flush_key_blocks_int(keycache, thread_var,
+                                   block->hash_link->file,
+                                   FLUSH_FORCE_WRITE))
+            DBUG_RETURN(1);
+        }
+      }
+
+    } while (found);
+
+    /*
+      Phase 2: Free all clean blocks. Normally this means free all
+      blocks. The changed blocks were flushed in phase 1 and became
+      clean. However we may need to wait for blocks that are read by
+      other threads. While we wait, a clean block could become changed
+      if that operation started before the resize operation started. To
+      be safe we must restart at phase 1.
+    */
+    do
+    {
+      found= 0;
+      /* Step over the whole file_blocks hash array. */
+      for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++)
+      {
+        /*
+          If an array element is non-empty, use the first block from its
+          chain to find a file for flush. All blocks for this file are
+          freed. So the same block will not appear at this place again
+          with the next iteration. If multiple files share the
+          same hash bucket, one of them will be flushed per iteration
+          of the outer loop of phase 2.
+        */
+        if ((block= keycache->file_blocks[idx]))
+        {
+          total_found++;
+          found++;
+          if (flush_key_blocks_int(keycache, thread_var,
+                                   block->hash_link->file,
+                                   FLUSH_RELEASE))
+            DBUG_RETURN(1);
+        }
+      }
+
+    } while (found);
+
+    /*
+      If any clean block has been found, we may have waited for it to
+      become free. In this case it could be possible that another clean
+      block became dirty. This is possible if the write request existed
+      before the resize started (BLOCK_FOR_UPDATE). Re-check the hashes.
+    */
+  } while (total_found);
+
+#ifndef DBUG_OFF
+  /* Now there should not exist any block any more. */
+  for (idx= 0; idx < CHANGED_BLOCKS_HASH; idx++)
+  {
+    DBUG_ASSERT(!keycache->changed_blocks[idx]);
+    DBUG_ASSERT(!keycache->file_blocks[idx]);
+  }
+#endif
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Reset the counters of a key cache.
+
+  SYNOPSIS
+    reset_key_cache_counters()
+    name       the name of a key cache
+    key_cache  pointer to the key kache to be reset
+
+  DESCRIPTION
+   This procedure is used by process_key_caches() to reset the counters of all
+   currently used key caches, both the default one and the named ones.
+
+  RETURN
+    0 on success (always because it can't fail)
+*/
+
+int reset_key_cache_counters(const char *name MY_ATTRIBUTE((unused)),
+                             KEY_CACHE *key_cache)
+{
+  DBUG_ENTER("reset_key_cache_counters");
+  if (!key_cache->key_cache_inited)
+  {
+    DBUG_PRINT("info", ("Key cache %s not initialized.", name));
+    DBUG_RETURN(0);
+  }
+  DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
+
+  key_cache->global_blocks_changed= 0;   /* Key_blocks_not_flushed */
+  key_cache->global_cache_r_requests= 0; /* Key_read_requests */
+  key_cache->global_cache_read= 0;       /* Key_reads */
+  key_cache->global_cache_w_requests= 0; /* Key_write_requests */
+  key_cache->global_cache_write= 0;      /* Key_writes */
+  DBUG_RETURN(0);
+}
+
+
+#if !defined(DBUG_OFF)
+#define F_B_PRT(_f_, _v_) DBUG_PRINT("assert_fail", (_f_, _v_))
+
+static int fail_block(BLOCK_LINK *block)
+{
+  F_B_PRT("block->next_used:    %lx\n", (ulong) block->next_used);
+  F_B_PRT("block->prev_used:    %lx\n", (ulong) block->prev_used);
+  F_B_PRT("block->next_changed: %lx\n", (ulong) block->next_changed);
+  F_B_PRT("block->prev_changed: %lx\n", (ulong) block->prev_changed);
+  F_B_PRT("block->hash_link:    %lx\n", (ulong) block->hash_link);
+  F_B_PRT("block->status:       %u\n", block->status);
+  F_B_PRT("block->length:       %u\n", block->length);
+  F_B_PRT("block->offset:       %u\n", block->offset);
+  F_B_PRT("block->requests:     %u\n", block->requests);
+  F_B_PRT("block->temperature:  %u\n", block->temperature);
+  return 0; /* Let the assert fail. */
+}
+
+static int fail_hlink(HASH_LINK *hlink)
+{
+  F_B_PRT("hlink->next:    %lx\n", (ulong) hlink->next);
+  F_B_PRT("hlink->prev:    %lx\n", (ulong) hlink->prev);
+  F_B_PRT("hlink->block:   %lx\n", (ulong) hlink->block);
+  F_B_PRT("hlink->diskpos: %lu\n", (ulong) hlink->diskpos);
+  F_B_PRT("hlink->file:    %d\n", hlink->file);
+  return 0; /* Let the assert fail. */
+}
+
+static int cache_empty(KEY_CACHE *keycache)
+{
+  int errcnt= 0;
+  int idx;
+  if (keycache->disk_blocks <= 0)
+    return 1;
+  for (idx= 0; idx < keycache->disk_blocks; idx++)
+  {
+    BLOCK_LINK *block= keycache->block_root + idx;
+    if (block->status || block->requests || block->hash_link)
+    {
+      my_message_local(INFORMATION_LEVEL, "block index: %u", idx);
+      fail_block(block);
+      errcnt++;
+    }
+  }
+  for (idx= 0; idx < keycache->hash_links; idx++)
+  {
+    HASH_LINK *hash_link= keycache->hash_link_root + idx;
+    if (hash_link->requests || hash_link->block)
+    {
+      my_message_local(INFORMATION_LEVEL, "hash_link index: %u", idx);
+      fail_hlink(hash_link);
+      errcnt++;
+    }
+  }
+  if (errcnt)
+  {
+    my_message_local(INFORMATION_LEVEL, "blocks: %d  used: %lu",
+                     keycache->disk_blocks, keycache->blocks_used);
+    my_message_local(INFORMATION_LEVEL, "hash_links: %d  used: %d",
+                     keycache->hash_links, keycache->hash_links_used);
+  }
+  return !errcnt;
+}
+#endif
+
-- 
cgit v1.1