Kernel panic when bringing up DRBD resource

dw.emplod asked:

I’m trying to set up two machines synchronizing with DRBD. The storage is set up as follows: PV -> LVM -> DRBD -> CLVM -> GFS2.

DRBD is set up in dual primary mode. The first server is set up and running fine in primary mode. The drives on the first server have data on them. I’ve set up the second server and I’m trying to bring up the DRBD resources. I created all the base LVM logical volumes to match the first server. After initializing the resources with

drbdadm create-md storage

I’m bringing up the resources by issuing

drbdadm up storage

After issuing that command, I get a kernel panic and the server reboots in 30 seconds. Here’s a screen capture.

[Screen capture of the kernel panic; image not included]

My configuration is as follows:
OS: CentOS 6

uname -a
Linux host.structuralcomponents.net 2.6.32-279.5.2.el6.x86_64 #1 SMP Fri Aug 24 01:07:11 UTC 2012 x86_64 x86_64 x86_64 GNU/Linux

rpm -qa | grep drbd

kmod-drbd84-8.4.1-2.el6.elrepo.x86_64
drbd84-utils-8.4.1-2.el6.elrepo.x86_64

cat /etc/drbd.d/global_common.conf

global {
        usage-count yes;
        # minor-count dialog-refresh disable-ip-verification
}

common {
    handlers {
            pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
            pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
            local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
            # fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
            # split-brain "/usr/lib/drbd/notify-split-brain.sh root";
            # out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
            # before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
            # after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
    }

    startup {
            # wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
            become-primary-on       both;
            wfc-timeout             30;
            degr-wfc-timeout        10;
            outdated-wfc-timeout    10;
    }

    options {
            # cpu-mask on-no-data-accessible
    }

    disk {
            # size max-bio-bvecs on-io-error fencing disk-barrier disk-flushes
            # disk-drain md-flushes resync-rate resync-after al-extents
            # c-plan-ahead c-delay-target c-fill-target c-max-rate
            # c-min-rate disk-timeout

    }

    net {
            # protocol timeout max-epoch-size max-buffers unplug-watermark
            # connect-int ping-int sndbuf-size rcvbuf-size ko-count
            # allow-two-primaries cram-hmac-alg shared-secret after-sb-0pri
            # after-sb-1pri after-sb-2pri always-asbp rr-conflict
            # ping-timeout data-integrity-alg tcp-cork on-congestion
            # congestion-fill congestion-extents csums-alg verify-alg
            # use-rle
            protocol C;
            allow-two-primaries yes;
            after-sb-0pri   discard-zero-changes;
            after-sb-1pri   discard-secondary;
            after-sb-2pri   disconnect;
    }
}

cat /etc/drbd.d/storage.res

resource storage {
        device          /dev/drbd0;
        meta-disk       internal;

        on host.structuralcomponents.net {
                address         10.10.1.120:7788;
                disk            /dev/vg_storage/lv_storage;
        }
        on host2.structuralcomponents.net {
                address         10.10.1.121:7788;
                disk            /dev/vg_storage/lv_storage;
        }
}

/var/log/messages is not logging anything about the crash.

I’ve been trying to find a cause of this but I’ve come up with nothing. Can anyone help me out? Thanks.

My answer:


A machine check exception (MCE) indicates a hardware problem. You can use mcelog to decode it, if you can boot the system.

The resolution is to replace the failing hardware. Since it looks like you’re most likely leasing a server, contact the provider.


View the full question and answer on Server Fault.

Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.