From d724497cbbcd05ecb8e46c02a54ba6fb808c1d06 Mon Sep 17 00:00:00 2001
Message-Id: 
In-Reply-To: <5e4659718c6d6ee9ab11b269d929a292a71b3ab0.1334770230.git.minovotn@redhat.com>
References: <5e4659718c6d6ee9ab11b269d929a292a71b3ab0.1334770230.git.minovotn@redhat.com>
From: Paolo Bonzini
Date: Fri, 13 Apr 2012 16:27:23 +0200
Subject: [PATCH 15/18] block: add mirror job

RH-Author: Paolo Bonzini
Message-id: <1334334446-31987-14-git-send-email-pbonzini@redhat.com>
Patchwork-id: 39226
O-Subject: [RHEL 6.3 qemu-kvm PATCH 13/16] block: add mirror job
Bugzilla: 806432
RH-Acked-by: Kevin Wolf
RH-Acked-by: Laszlo Ersek
RH-Acked-by: Jeffrey Cody

Bugzilla: 806432
Upstream status: submitted as part of the mirroring forward-port

This patch adds the implementation of a new job that mirrors a disk to
a new image while letting the guest continue using the old image.
The target is treated as a "black box" and data is copied from the
source to the target in the background.

The mirror job is never-ending, but it is logically structured into
two phases: 1) copy all data as fast as possible until the target
first gets in sync with the source; 2) keep the target in sync and
ensure that reopening to the target yields a correct (full) copy
of the source data.

The second phase is indicated by the progress in "info block-jobs"
reporting the current offset to be equal to the length of the file.

When the job is cancelled in the second phase, QEMU will run the job
until the source is clean and quiescent, then it will report successful
completion of the job.  (Note that management may already have lost the
race against QEMU and received a completion event instead of a
cancellation event.)

Signed-off-by: Paolo Bonzini

Conflicts:
	Makefile.objs
	block_int.h
---
 Makefile.objs  |    2 +-
 block/mirror.c |  307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h    |    7 ++
 trace-events   |    4 +
 4 files changed, 319 insertions(+), 1 deletion(-)
 create mode 100644 block/mirror.c

Signed-off-by: Michal Novotny
---
 Makefile.objs  |    2 +-
 block/mirror.c |  307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h    |    7 ++
 trace-events   |    4 +
 4 files changed, 319 insertions(+), 1 deletion(-)
 create mode 100644 block/mirror.c

diff --git a/Makefile.objs b/Makefile.objs
index e1fc3a2..96729e7 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -26,7 +26,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o
-block-nested-y += stream.o
+block-nested-y += stream.o mirror.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/mirror.c b/block/mirror.c
new file mode 100644
index 0000000..eb5770a
--- /dev/null
+++ b/block/mirror.c
@@ -0,0 +1,307 @@
+/*
+ * Image mirroring
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Paolo Bonzini
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     */
+    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
+};
+
+#define SLICE_TIME 100ULL /* ms */
+
+typedef struct {
+    int64_t next_slice_time;
+    uint64_t slice_quota;
+    uint64_t dispatched;
+} RateLimit;
+
+static int64_t ratelimit_calculate_delay(RateLimit *limit, uint64_t n)
+{
+    int64_t delay_ms = 0;
+    int64_t now = qemu_get_clock(rt_clock);
+
+    if (limit->next_slice_time < now) {
+        limit->next_slice_time = now + SLICE_TIME;
+        limit->dispatched = 0;
+    }
+    if (limit->dispatched + n > limit->slice_quota) {
+        delay_ms = limit->next_slice_time - now;
+    } else {
+        limit->dispatched += n;
+    }
+    return MAX(0, delay_ms);
+}
+
+static void ratelimit_set_speed(RateLimit *limit, uint64_t speed)
+{
+    limit->slice_quota = speed / (1000ULL / SLICE_TIME);
+}
+
+typedef struct MirrorBlockJob {
+    BlockJob common;
+    RateLimit limit;
+    BlockDriverState *target;
+    bool full;
+} MirrorBlockJob;
+
+static int coroutine_fn mirror_populate(BlockDriverState *source,
+                                        BlockDriverState *target,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len  = nb_sectors * 512,
+    };
+    QEMUIOVector qiov;
+    int ret;
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* Copy-on-read the unallocated clusters */
+    ret = bdrv_co_readv(source, sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        return ret;
+    }
+    return bdrv_co_writev(target, sector_num, nb_sectors, &qiov);
+}
+
+static int is_any_allocated(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    BlockDriverState *intermediate;
+    int ret, n, unalloc = nb_sectors;
+
+    intermediate = bs;
+    while (intermediate) {
+        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
+                                   &n);
+        if (ret < 0) {
+            return ret;
+        } else if (ret) {
+            break;
+        } else {
+            unalloc = MIN(unalloc, n);
+        }
+
+        intermediate = intermediate->backing_hd;
+    }
+
+    *pnum = ret ? n : unalloc;
+    return ret;
+}
+
+static void coroutine_fn mirror_run(void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+    BlockDriverState *bs = s->common.bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n;
+    bool synced = false;
+    void *buf;
+
+    if (block_job_is_cancelled(&s->common)) {
+        goto immediate_exit;
+    }
+
+    s->common.len = bdrv_getlength(bs);
+    if (s->common.len < 0) {
+        block_job_complete(&s->common, s->common.len);
+        return;
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    buf = qemu_blockalign(bs, BLOCK_SIZE);
+
+    /* First part, loop on the sectors and initialize the dirty bitmap.  */
+    for (sector_num = 0; sector_num < end; ) {
+        int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+        if (s->full) {
+            ret = is_any_allocated(bs, sector_num, next - sector_num, &n);
+        } else {
+            ret = bdrv_co_is_allocated(bs, sector_num, next - sector_num, &n);
+        }
+        if (ret < 0) {
+            break;
+        }
+
+        if (ret == 1) {
+            bdrv_set_dirty(bs, sector_num, n);
+            sector_num = next;
+        } else {
+            sector_num += n;
+        }
+    }
+
+    if (ret < 0) {
+        /* Clean up and complete once, instead of completing here and
+         * then falling through to the mirroring loop (and a second
+         * completion at immediate_exit).
+         */
+        goto immediate_exit;
+    }
+
+    sector_num = -1;
+    for (;;) {
+        int64_t cnt;
+        s->common.busy = true;
+        if (bdrv_get_dirty_count(bs) == 0) {
+            /* Switch out of the streaming phase.  From now on, if the
+             * job is cancelled we will actually complete all pending
+             * I/O and report completion, so that drive-reopen can be
+             * used to pivot to the mirroring target.
+             */
+            synced = true;
+            sector_num = -1;
+            s->common.offset = end * BDRV_SECTOR_SIZE;
+        }
+
+        if (synced && block_job_is_cancelled(&s->common)) {
+            /* The dirty bitmap is not updated while operations are pending.
+             * If we're about to exit, wait for pending operations or we may
+             * exit while the source has dirty data to copy!
+             */
+            while (bdrv_get_dirty_count(bs) == 0 &&
+                   !QLIST_EMPTY(&bs->tracked_requests)) {
+                qemu_aio_wait();
+            }
+        }
+
+        if (bdrv_get_dirty_count(bs) != 0) {
+            int nb_sectors;
+            sector_num = bdrv_get_next_dirty(bs, sector_num);
+            nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - sector_num);
+            trace_mirror_one_iteration(s, sector_num);
+            bdrv_reset_dirty(bs, sector_num, BDRV_SECTORS_PER_DIRTY_CHUNK);
+            ret = mirror_populate(bs, s->target, sector_num, nb_sectors, buf);
+            if (ret < 0) {
+                break;
+            }
+        }
+
+        ret = 0;
+        cnt = bdrv_get_dirty_count(bs);
+        if (synced) {
+            if (!block_job_is_cancelled(&s->common)) {
+                s->common.busy = false;
+                co_sleep(rt_clock, cnt == 0 ? SLICE_TIME : 0);
+            } else if (cnt == 0 && QLIST_EMPTY(&bs->tracked_requests)) {
+                /* The two disks are in sync.  Exit and report successful
+                 * completion.
+                 */
+                s->common.cancelled = false;
+                break;
+            }
+
+            /* We get here either to poll the target, or because the job
+             * was cancelled.  In the latter case, we still have an
+             * opportunity to do I/O (without going to sleep) before
+             * exiting.
+             */
+        } else {
+            uint64_t delay_ms;
+
+            /* Publish progress */
+            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+
+            if (s->common.speed) {
+                delay_ms = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+            } else {
+                delay_ms = 0;
+            }
+
+            /* Note that even when no rate limit is applied we need to yield
+             * with no pending I/O here so that qemu_aio_flush() returns.
+             */
+            s->common.busy = false;
+            co_sleep(rt_clock, delay_ms);
+            if (block_job_is_cancelled(&s->common)) {
+                break;
+            }
+        }
+    }
+
+immediate_exit:
+    bdrv_set_dirty_tracking(bs, false);
+    bdrv_close(s->target);
+    bdrv_delete(s->target);
+    block_job_complete(&s->common, ret);
+}
+
+static int mirror_set_speed(BlockJob *job, int64_t value)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    if (value < 0) {
+        return -EINVAL;
+    }
+    ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE);
+    return 0;
+}
+
+static BlockJobType mirror_job_type = {
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = "mirror",
+    .set_speed     = mirror_set_speed,
+};
+
+int mirror_start(BlockDriverState *bs,
+                 const char *target, BlockDriver *drv, int flags,
+                 BlockDriverCompletionFunc *cb,
+                 void *opaque, bool full)
+{
+    MirrorBlockJob *s;
+    int ret;
+
+    s = block_job_create(&mirror_job_type, bs, cb, opaque);
+    if (!s) {
+        return -EBUSY; /* bs must already be in use */
+    }
+
+    s->target = bdrv_new("");
+    ret = bdrv_open(s->target, target,
+                    flags | BDRV_O_NO_BACKING | BDRV_O_NO_FLUSH | BDRV_O_CACHE_WB,
+                    drv);
+
+    if (ret < 0) {
+        bdrv_delete(s->target);
+        return ret;
+    }
+
+    s->full = full;
+    bdrv_set_dirty_tracking(bs, true);
+    s->common.co = qemu_coroutine_create(mirror_run);
+    trace_mirror_start(bs, s, s->common.co, opaque);
+    return 0;
+}
+
+void mirror_abort(BlockDriverState *bs)
+{
+    MirrorBlockJob *s = container_of(bs->job, MirrorBlockJob, common);
+
+    /* Keep the coroutine entry under the NULL check as well; entering
+     * with s == NULL would dereference a missing job.
+     */
+    if (s) {
+        block_job_cancel(&s->common);
+        qemu_coroutine_enter(s->common.co, s);
+    }
+}
+
+void mirror_commit(BlockDriverState *bs)
+{
+    MirrorBlockJob *s = container_of(bs->job, MirrorBlockJob, common);
+
+    assert(s->common.bs == bs);
+    qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block_int.h b/block_int.h
index 037ca61..adb234a 100644
--- a/block_int.h
+++ b/block_int.h
@@ -317,6 +317,13 @@ int stream_start(BlockDriverState *bs, BlockDriverState *base, const
 char *base_id, BlockDriverCompletionFunc *cb, void *opaque);
 
+int mirror_start(BlockDriverState *bs,
+                 const char *target, BlockDriver *drv, int flags,
+                 BlockDriverCompletionFunc *cb,
+                 void *opaque, bool full);
+void mirror_abort(BlockDriverState *bs);
+void mirror_commit(BlockDriverState *bs);
+
 typedef struct BlockConf {
     BlockDriverState *bs;
     uint16_t physical_block_size;
diff --git a/trace-events b/trace-events
index 42a3e15..5a616a8 100644
--- a/trace-events
+++ b/trace-events
@@ -63,6 +63,10 @@ disable bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write
 disable bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 disable bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
+# block/mirror.c
+disable mirror_one_iteration(void *s, int64_t sector_num) "s %p sector_num %"PRId64""
+disable mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+
 # block/stream.c
 disable stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
 disable stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
-- 
1.7.7.6
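
P.S. For reviewers unfamiliar with the intended call sequence, here is a
minimal caller sketch for the three entry points this patch adds to
block_int.h. It is illustrative only and not part of the patch: the names
mirror_cb and start_mirror_example, the "qcow2" format choice and the
target path are made up, and it assumes QEMU's BlockDriverCompletionFunc
typedef (void (*)(void *opaque, int ret)) and bdrv_find_format() from
block.h. Error reporting is elided.

  /* Hypothetical caller sketch, not part of the patch. */
  static void mirror_cb(void *opaque, int ret)
  {
      /* ret == 0 on completion, including a phase-2 cancellation that
       * drained to a clean, quiescent source; ret < 0 on error. */
  }

  static int start_mirror_example(BlockDriverState *bs)
  {
      BlockDriver *drv = bdrv_find_format("qcow2");
      int ret;

      /* Phase 1 begins: copy everything (full == true).  -EBUSY means
       * bs already has a job attached. */
      ret = mirror_start(bs, "/tmp/mirror-target.qcow2", drv, BDRV_O_RDWR,
                         mirror_cb, NULL, true);
      if (ret < 0) {
          return ret;
      }

      /* Once "info block-jobs" reports offset == len (phase 2),
       * management either pivots and lets the job finish ... */
      mirror_commit(bs);
      /* ... or abandons the mirror; a phase-2 cancel still drains
       * pending I/O before reporting completion:
       *     mirror_abort(bs);
       */
      return 0;
  }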