#! /usr/bin/python
#
# Copyright (C) 2016 Citrix Systems R&D Ltd.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; version 2.1 only. with the special
# exception on linking described in file LICENSE.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

import os
import re
import sys
import socket
import tempfile
import errno
import stat
import pwd
import grp
from subprocess import call
import ctypes
import ctypes.util
import os
from resource import getrlimit, RLIMIT_CORE, RLIMIT_FSIZE, setrlimit

import xen.lowlevel.xs as xs

import qemu_trad_image

#
# Constants from Xen's public/hvm/e820.h
#
HVM_BELOW_4G_RAM_END = 0xf0000000
# The MMIO hole begins where RAM below 4 GiB ends ...
HVM_BELOW_4G_MMIO_START = HVM_BELOW_4G_RAM_END
# ... and runs up to the 4 GiB boundary (1 << 32).
HVM_BELOW_4G_MMIO_LENGTH = (1 << 32) - HVM_BELOW_4G_MMIO_START

# NOTE(review): dic_mac is not referenced anywhere else in this file as
# shown -- confirm whether it is still needed.
dic_mac = {}
# File descriptor numbers recorded by main() (netdev "fd=" parameters and
# the clipboard socket); close_fds() closes every entry.
open_fds = []

# Module-level xenstore handle shared by the xenstore_* helper functions.
xenstore = xs.xs()

# Set cgroup_slice to the name of the cgroup slice the qemu-dm process
# should live in.
#  - None means leave in the same slice as the parent process.
#  - '' means move it into the default slice.
#  - 'system.slice' means move it into the system slice, etc.
# If the nominated slice does not already exist, the process will be
# left in its parent's slice.
cgroup_slice = ''

# Namespace flags; main() ORs all three together and passes them to
# unshare() just before exec'ing QEMU.
CLONE_NEWNS = 0x00020000 # mount namespace
CLONE_NEWNET = 0x40000000 # network namespace
CLONE_NEWIPC = 0x08000000 # IPC namespace

def unshare(flags):
    """Call the C library's unshare() with `flags` for the current process.

    `flags` is passed through unchanged as a C int (main() passes an OR of
    the CLONE_NEW* constants).  Returns None on success.

    Raises:
        OSError: carrying the C errno and its strerror() text when the
            library call returns a negative value.
    """
    c_library = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
    prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True)
    sys_unshare = prototype(('unshare', c_library))

    if sys_unshare(flags) >= 0:
        return

    # use_errno=True makes ctypes keep a private copy of errno for us.
    err = ctypes.get_errno()
    raise OSError(err, os.strerror(err))

def restrict_fsize():
    """Set both the soft and hard RLIMIT_FSIZE of this process to 256 * 1024."""
    max_file_size = 256 << 10
    fsize_limits = (max_file_size, max_file_size)
    setrlimit(RLIMIT_FSIZE, fsize_limits)

def enable_core_dumps():
    """Set the soft core-dump size limit to 64 MiB and return that limit.

    The hard limit is raised to 64 MiB only when the current hard limit
    compares lower than that; otherwise the existing hard limit is kept.
    The soft limit is always set, and the limit is always returned, so the
    caller can format the result as an integer.

    Returns:
        The soft RLIMIT_CORE value that was applied (64 * 1024 * 1024).

    Raises:
        ValueError / OSError (resource.error on Python 2): propagated from
            setrlimit(), e.g. when the hard limit has to be raised and the
            process is not allowed to do so.
    """
    limit = 64 * 1024 * 1024
    hardlimit = getrlimit(RLIMIT_CORE)[1]
    # NOTE(review): where an unlimited hard limit is reported as -1, this
    # comparison treats it as "lower" and so caps the hard limit at 64 MiB;
    # that matches the original comparison -- confirm it is intended.
    if limit > hardlimit:
        hardlimit = limit
    # Previously these two statements were nested under the `if`, so a
    # process whose hard limit was already >= 64 MiB got no soft limit and
    # the function returned None.
    setrlimit(RLIMIT_CORE, (limit, hardlimit))
    return limit

def xenstore_read(path):
    """Read `path` through the module-level xenstore handle and return the result."""
    # presumably the leading "" means "no transaction" -- confirm against
    # the xen.lowlevel.xs bindings.
    value = xenstore.read("", path)
    return value

def xenstore_write(path, value):
    """Write `value` at `path` through the module-level xenstore handle.

    Returns whatever the handle's write() call returns.
    """
    # presumably the leading "" means "no transaction" -- confirm against
    # the xen.lowlevel.xs bindings.
    result = xenstore.write("", path, value)
    return result

def xenstore_ls(path):
    """List the entries under `path` through the module-level xenstore handle.

    Returns whatever the handle's ls() call returns; callers in this file
    treat a falsy result as "nothing there".
    """
    # presumably the leading "" means "no transaction" -- confirm against
    # the xen.lowlevel.xs bindings.
    children = xenstore.ls("", path)
    return children

def get_drive_args_for_backend(dom, backend):
    """Build QEMU "-drive" arguments for one dom0 block backend of a domain.

    Walks /local/domain/0/backend/<backend>/<dom> in xenstore and emits one
    "-drive" option pair for each entry whose "dev" is xvda..xvdd, except
    read-only entries that are not treated as a CD-ROM.

    Args:
        dom: domain id as a string (it is concatenated into the path).
        backend: backend directory name, e.g. 'vbd3' or 'qdisk'.

    Returns:
        A flat list of the form ["-drive", "<options>", ...]; empty when
        the backend directory lists nothing.
    """
    backend_root = "/local/domain/0/backend/" + backend + "/" + dom

    # Guest device name -> IDE index: xvda..xvdd map to 0..3.
    ide_index = {}
    for slot, letter in enumerate('abcd'):
        ide_index['xvd' + letter] = slot

    entries = xenstore_ls(backend_root)
    if not entries:
        return []

    drive_args = []
    for entry in entries:
        dev = xenstore_read("%s/%s/dev" % (backend_root, entry))
        if dev not in ide_index:
            continue

        # The backing path is optional; without a "params" key the drive
        # is emitted with neither "file=" nor "format=".
        image = ''
        file_opt = ''
        if "params" in xenstore_ls("%s/%s" % (backend_root, entry)):
            image = xenstore_read("%s/%s/params" % (backend_root, entry))
            if image.startswith("nbd:unix:"):
                image += ":exportname=qemu_node"
            file_opt = 'file=%s,' % image

        mode = xenstore_read("%s/%s/mode" % (backend_root, entry))

        # XXX This is a heuristic to determine if the disk is a CD. We
        # should be told by the toolstack.
        is_cdrom = (dev == 'xvdd' and mode == 'r')

        # Read-only entries are only emitted when they are the CD-ROM.
        if mode == 'r' and not is_cdrom:
            continue

        if is_cdrom:
            media, forcelba = 'cdrom', 'off'
        else:
            media, forcelba = 'disk', 'on'
        format_opt = '' if image == '' else ',format=raw'

        # XXX What should cache=... be set to?
        drive_args += ["-drive",
                       "%sif=ide,index=%d,media=%s%s,force-lba=%s" %
                       (file_opt, ide_index[dev], media, format_opt, forcelba)]

    return drive_args

def get_drive_args(dom):
    """Collect the "-drive" arguments for domain `dom` from every backend.

    Queries the 'vbd3' backend first and then 'qdisk', concatenating the
    lists returned by get_drive_args_for_backend() in that order.
    """
    combined = []
    for backend_name in ('vbd3', 'qdisk'):
        combined += get_drive_args_for_backend(dom, backend_name)
    return combined

def close_fds():
    """Close every file descriptor recorded in the module-level open_fds list.

    main() passes this as the preexec_fn of the xs-clipboardd child.  The
    list itself is left unmodified.
    """
    for descriptor in open_fds:
        os.close(descriptor)

def main(argv):
    """Rewrite a qemu-dm style command line and exec the upstream QEMU binary.

    Steps, in order:
      * find the domain id from "-xen-domid <id>" in `argv`;
      * build a "-machine" option whose max-ram-below-4g comes from the Xen
        MMIO-hole constants, or from the domain's vm-data/mmio-hole-size
        xenstore key when that is set;
      * append argv[2:] and the "-drive" options from get_drive_args();
      * strip or translate options (--syslog, -priv, -acpi, -videoram,
        -std-vga, -vgpu, -xengt, -loadvm);
      * add the VGA/vGPU device, a clipboard socket and, unless "-priv" was
        given, the chroot/runas deprivileging options;
      * run xs-clipboardd, prepare the environment, resource limits, cgroup
        and namespaces, and replace this process with QEMU via os.execve().

    Args:
        argv: the full argument vector including argv[0].  argv[1] is
            logged but not forwarded to QEMU.

    Returns:
        Nothing; on success os.execve() does not return.

    Raises:
        ValueError: if "-xen-domid" with a following value is not present
            (or the value is not an integer).
        OSError: from directory/device-node creation (other than EEXIST),
            from unshare(), or from os.execve().
    """
    f = sys.stdout

    f.write("Arguments: %s\n" % " ".join(argv[1:]))

    # Stop one short of the end so that a trailing "-xen-domid" without a
    # value counts as missing instead of indexing past the list.
    domid = None
    for n in range(len(argv) - 1):
        if argv[n] == "-xen-domid":
            domid = int(argv[n + 1])
            break
    if domid is None:
        raise ValueError("-xen-domid <domid> not found in arguments")

    qemu_dm = '/usr/lib64/xen/bin/qemu-system-i386'
    drive_args = get_drive_args('%d' % domid)
    qemu_args = ['qemu-dm-%d' % domid]

    mmio_start = HVM_BELOW_4G_MMIO_START
    # vGPU now requires extra space in lower MMIO hole by default
    if '-vgpu' in argv:
        mmio_start -= HVM_BELOW_4G_MMIO_LENGTH

    # read the size of lower MMIO hole provisioned by xenguest
    mmio_size = xenstore_read("/local/domain/%d/vm-data/mmio-hole-size" % domid)
    if mmio_size:
        mmio_start = (1 << 32) - int(mmio_size)

    # don't allow hvmloader to change the lower MMIO hole size
    xenstore_write("/local/domain/%d/hvmloader/allow-memory-relocate" % domid, "0")
    # prepare the correct set of ACPI tables in hvmloader
    xenstore_write("/local/domain/%d/platform/device-model" % domid, "qemu_xen")

    trad_compat = 'true'
    igdpt = ''

    qemu_args.extend(['-machine',
                      'pc-0.10,accel=xen,max-ram-below-4g=%lu,'
                      'allow-unassigned=true,trad_compat=%s%s'
                      % (mmio_start, trad_compat, igdpt)])

    qemu_args.extend(argv[2:])
    qemu_args.extend(drive_args)

    # support toolstack override of NIC device model for debug purposes
    nic = xenstore_read("/local/domain/%d/platform/nic_type" % domid)
    if nic:
        # Must stay a real list: the loop below indexes and deletes from it.
        qemu_args = [arg.replace("rtl8139", nic) for arg in qemu_args]

    n = 0

    vga_type = 'cirrus-vga'
    vgamem_mb = 4
    vga_extra_props = ["rombar=1,romfile=",
                       "subvendor_id=0x5853,subsystem_id=0x0001,addr=2"]
    depriv = True

    # Walk the argument list in place.  Branches that delete the current
    # element `continue` without advancing n, so the element that slid into
    # position n is examined next.
    while n < len(qemu_args):
        p = qemu_args[n]

        if p == "-netdev":
            # Remember any fd= descriptors so the clipboardd child can
            # close them; the option itself is passed on to QEMU.
            params = qemu_args[n + 1].split(',')
            for param in params:
                if param.startswith('fd='):
                    open_fds.append(int(param.split('=')[1]))

        if p == "--syslog":
            del qemu_args[n]
            continue

        if p == "-priv":
            del qemu_args[n]
            depriv = False
            continue

        if p == "-acpi":
            del qemu_args[n]
            continue

        if p == "-videoram":
            vgamem_mb = int(qemu_args[n+1])
            # Drop both the option and its value.
            del qemu_args[n]
            del qemu_args[n]
            continue

        if p == "-std-vga":
            vga_type = 'VGA'
            vga_extra_props += ['qemu-extended-regs=false']
            del qemu_args[n]
            continue

        if p == "-vgpu":
            vga_type = 'vgpu'
            del qemu_args[n]
            continue

        if p == "-xengt":
            # Unlike the options above, -xengt is left in the list.
            vga_type = 'VGA'
            vga_extra_props += ['qemu-extended-regs=false']
            n += 1
            continue

        if p == "-loadvm":
            loadvm_path = qemu_args[n+1]
            if qemu_trad_image.is_trad_image(loadvm_path):
                f.write("QEMU Traditional image detected. Upgrading...\n")

                loadvm_file = open(loadvm_path, "rb")
                incoming_file = tempfile.TemporaryFile()
                qemu_trad_image.convert_file(loadvm_file, incoming_file)
                loadvm_file.close()

                incoming_file.seek(0)
                incoming_fd = os.dup(incoming_file.fileno())
            else:
                incoming_fd = os.open(loadvm_path, os.O_RDONLY)
            # NOTE(review): this relies on the descriptor being inherited
            # across execve() -- confirm if ever run on an interpreter
            # where new descriptors are non-inheritable by default.
            qemu_args[n] = "-incoming"
            qemu_args[n+1] = "fd:%d" % incoming_fd

        n += 1

    if vga_type == 'vgpu':
        qemu_args += ["-device", "vgpu"]
    else:
        qemu_args += ["-device", ",".join([vga_type, "vgamem_mb=%d" % vgamem_mb]
                                          + vga_extra_props)]

    # s1 is handed to QEMU by number; s2 goes to xs-clipboardd below.
    s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM)
    open_fds.append(s1.fileno())
    qemu_args += ["-vnc-clipboard-socket-fd", str(s1.fileno())]
    #qemu_args += ["-trace", "events=/root/trace"]
    #qemu_args += ["-monitor", "tcp:127.0.0.1:7777,server,nowait"]

    if depriv:
        root_dir = "/var/xen/qemu/root-{}".format(domid)
        try:
            os.makedirs(root_dir + "/dev", 0o755)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        try:
            os.mknod(root_dir + "/dev/null", 0o644 | stat.S_IFCHR,
                     os.makedev(1, 3))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        qemu_args += ["-xen-domid-restrict"]
        qemu_args += ["-chroot", root_dir]

        # Per-domain uid/gid: the qemu_base ids offset by the domain id.
        uid = pwd.getpwnam('qemu_base').pw_uid + domid
        gid = grp.getgrnam('qemu_base').gr_gid + domid
        qemu_args += ["-runas", "%d.%d" % (uid, gid)]

    xenstore_write("/libxl/%d/dm-version" % domid, "qemu_xen")

    f.write("Exec: %s %s\n" % (qemu_dm, " ".join(qemu_args)))

    # call() waits for the command to finish; the child first closes the
    # descriptors in open_fds (netdev fds and s1) so it only keeps s2.
    clipboardd = '/opt/xensource/libexec/xs-clipboardd'
    call([clipboardd, "-d", str(domid), "-s", str(s2.fileno())],
         preexec_fn=close_fds)

    s2.close()

    # set up library preload path for qemu such that it can use jemalloc
    qemu_env = os.environ
    jemalloc = "/usr/lib64/libjemalloc.so.1"
    if "LD_PRELOAD" in qemu_env:
        qemu_env["LD_PRELOAD"] = jemalloc + ":" + qemu_env["LD_PRELOAD"]
    else:
        qemu_env["LD_PRELOAD"] = jemalloc
    qemu_env["MALLOC_CONF"] = "narenas:1,tcache:false,lg_dirty_mult:22"

    core_dump_limit = enable_core_dumps()
    # %s rather than %d so that a None result is logged instead of raising
    # TypeError and aborting the launch.
    f.write("core dump limit: %s\n" % core_dump_limit)

    # The pid recorded here stays valid after execve(), which keeps the pid.
    pid = os.getpid()
    xenstore_write("/local/domain/%d/qemu-pid" % domid, "%d" % pid)

    if cgroup_slice is not None:
        # Move to nominated cgroup slice
        f.write("Moving to cgroup slice '%s'\n" % cgroup_slice)
        try:
            # Note the default slice uses /sys/fs/cgroup/cpu/tasks but
            # other.slice uses /sys/fs/cgroup/cpu/other.slice/tasks.
            with open("/sys/fs/cgroup/cpu/%s/tasks" % cgroup_slice, 'w') as g:
                g.write(str(pid))
        except IOError as e:
            # Best effort: stay in the parent's slice and carry on.
            f.write("Warning: writing pid to '%s' tasks file: %s\n"
                    % (cgroup_slice, e))

    restrict_fsize()

    unshare(CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWIPC)

    # Flush our log output before the process image is replaced.
    f.flush()
    os.execve(qemu_dm, qemu_args, qemu_env)

if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status,
    # although on success main() ends in os.execve() and never returns.
    sys.exit(main(sys.argv))
