#!/usr/bin/ruby
#
# Copyright (C) 2010,2011 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA  02110-1301, USA.  A copy of the GNU General Public License is
# also available at http://www.gnu.org/copyleft/gpl.html.

$: << File.join(File.dirname(__FILE__), "../app")

require 'rubygems'
require 'optparse'
require 'singleton'
require 'logger'
require 'util/taskomatic'
require 'fileutils'

# Defaults for the command-line switches.  The OptionParser callbacks below
# overwrite them when the matching option is given.
help = false
daemon = true
dbomatic_log_dir = "/var/log/aeolus-conductor"
dbomatic_pid_dir = "/var/run/aeolus-conductor"
# $dbomatic_timeout is also our polling interval between cloud status checks
$dbomatic_timeout = 60
$realms_timeout = 300
$deltacloud_timeout = 30
optparse = OptionParser.new do |opts|

  opts.banner = <<BANNER
Usage:
dbomatic [options]

Options:
BANNER
  # Every description that advertises a default is double-quoted; with single
  # quotes the interpolation marker would be printed literally in the help.
  opts.on( '-d', '--deltacloud-timeout N', "Time(in seconds) to wait for backend clouds to respond (defaults to #{$deltacloud_timeout})", Integer) do |timeout|
    $deltacloud_timeout = timeout
  end
  opts.on( '-f', '--pid-file PATH', "Use PATH to the dbomatic pid directory (defaults to #{dbomatic_pid_dir})") do |newpath|
    dbomatic_pid_dir = newpath
  end
  opts.on( '-h', '--help', '') { help = true }
  opts.on( '-l', '--log PATH', "Use PATH to the dbomatic log directory (defaults to #{dbomatic_log_dir}).  Use '-' for stdout") do |newpath|
    dbomatic_log_dir = newpath
  end
  opts.on( '-n', '--nodaemon', 'Do not daemonize (useful in combination with -l for debugging)') { daemon = false }
  opts.on( '-t', '--timeout N', "Time out (in seconds) between refreshes (defaults to #{$dbomatic_timeout})", Integer) do |timeout|
    $dbomatic_timeout = timeout
  end
  opts.on( '-r', '--realms-timeout N', "Time out (in seconds) between refreshes of realms (defaults to #{$realms_timeout})", Integer) do |timeout|
    $realms_timeout = timeout
  end
end

# Parse ARGV.  Any parse failure (unknown switch, missing argument, argument
# that is not an Integer, ...) prints a one-line reason followed by the usage
# text and exits with status 1 instead of dying with a backtrace.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  if e.is_a?(OptionParser::InvalidOption)
    # keep the historical wording for unknown switches
    puts "Invalid option #{e.args}"
  else
    puts e.message
  end
  puts
  puts optparse
  exit(1)
end

# -h / --help: show the usage text and stop successfully.
if help
  puts optparse
  exit(0)
end

# Log destination handed to the logger: a log directory of '-' selects
# STDOUT, anything else is used as the directory containing dbomatic.log.
DBOMATIC_LOG_FILE = dbomatic_log_dir == '-' ? STDOUT : "#{dbomatic_log_dir}/dbomatic.log"

# Custom Log Format
# Process-wide logger (Singleton) writing to DBOMATIC_LOG_FILE.  Each record
# is rendered as "<YYYY-MM-DD HH:MM:SS> <SEVERITY> <message>" on its own line.
class DBomaticLogger < Logger
  include Singleton

  # Opens the log device named by the DBOMATIC_LOG_FILE constant.
  def initialize
    super(DBOMATIC_LOG_FILE)
  end

  # Builds one log line; progname is accepted but not included in the output.
  def format_message(severity, timestamp, progname, msg)
    stamp = timestamp.strftime('%Y-%m-%d %H:%M:%S')
    format("%s %s %s\n", stamp, severity, msg)
  end
end

# Fetch the shared logger, configure it, and announce startup.
logger = DBomaticLogger.instance
logger.datetime_format = "%Y-%m-%d %H:%M:%S "  # simplify time output
logger.level = Logger::INFO
logger.info("DBOmatic starting up")

# daemonize
if daemon
  # if we are daemonizing under systemd, it cares very deeply that we write
  # the PID file *before* the original parent goes away (due to possible race
  # conditions).  Therefore we use the pipe trick; the original parent opens
  # a pipe, and then waits around for the final child to write a byte to it
  # to signal completion
  rd, wr = IO.pipe

  pid = fork
  if pid
    # parent

    # doesn't need the write end of the pipe, so close it out
    wr.close

    # wait around for data to come in from the pipe
    data = rd.read(1)
    rd.close

    if data.nil?
      # EOF without a byte: every writer went away before signalling success.
      # assume the child wrote the error
      exit 1
    end

    exit 0
  else
    # child
    rd.close

    # become a session leader, then fork once more and let the intermediate
    # process exit; only the second-generation child continues below
    Process.setsid
    exit if fork

    Dir.chdir "/"

    File.umask 022

    STDIN.reopen "/dev/null"       # Free file descriptors and
    STDOUT.reopen "/dev/null", "a" # point them somewhere sensible.
    STDERR.reopen '/dev/null', 'a'

    trap("TERM") { exit }

    begin
      DBOMATIC_PID_FILE = "#{dbomatic_pid_dir}/dbomatic.pid"
      FileUtils.mkdir_p File.dirname(DBOMATIC_PID_FILE)
      # File.open rather than Kernel#open: the directory comes from the
      # command line and Kernel#open would treat a leading "|" as a command.
      File.open(DBOMATIC_PID_FILE, "w") {|f| f.write(Process.pid) }
      File.chmod(0644, DBOMATIC_PID_FILE)
    rescue Exception => e
      # Log without mutating the backtrace, so the exception re-raised below
      # still carries every frame.
      first_frame, *remaining_frames = e.backtrace
      logger.error "#{first_frame}: #{e.message}"
      remaining_frames.each do |step|
        logger.error "\tfrom #{step}"
      end
      raise
    end

    # PID file is in place: release the original parent
    wr.write("a")
    wr.close
  end

end

# Current local time rendered as "YYYY-MM-DD HH:MM:SS".
def self.now
  current = Time.now
  current.strftime('%Y-%m-%d %H:%M:%S')
end

# Walks every instance of every pool and returns the distinct provider
# accounts, in first-seen order, of instances that have an account and are
# not in Instance::STATE_NEW.
def collect_accounts
  found = []
  Pool.all.each do |pool|
    pool.instances.each do |inst|
      owner = inst.provider_account
      next unless owner
      next if inst.state == Instance::STATE_NEW
      next if found.include?(owner)
      found << owner
    end
  end
  found
end

# Extract 'ipv4' and 'hostname' addresses from Deltacloud
# Returns the :address of every entry whose :type is 'ipv4' or 'hostname',
# in the original order.  A nil list yields an empty array, so callers can
# treat "provider reported nothing" and "nothing usable" the same way.
def extract_addresses(address_list)
  return [] if address_list.nil?

  wanted_types = ['ipv4', 'hostname']
  usable = address_list.select { |address| wanted_types.include?(address[:type]) }
  usable.map { |address| address[:address] }
end

# Looks up the provider-side record for +instance+ over +connection+.
# Returns whatever connection.instance returns, or nil (after logging a
# warning) when the lookup raises.
# NOTE(review): the rescue is kept as broad as the original inline one;
# narrowing it to StandardError would be cleaner but changes which provider
# failures are tolerated -- confirm before tightening.
def fetch_api_instance(connection, instance)
  connection.instance(instance.external_key)
rescue Exception => e
  DBomaticLogger.instance.warn("caught deltacloud exception #{e} when updating instance #{instance.name}")
  nil
end

# Synchronizes every instance of +account+ with what its provider reports.
#
# For instances outside the NEW / STOPPED / CREATE_FAILED states the state and
# addresses are refreshed, or the instance is marked VANISHED when the
# provider no longer returns it (or the lookup fails).  Instances that are
# STOPPED and have never accumulated RUNNING time are then started.
#
# Raises ActionError when a start action cannot be queued for such an
# instance.
def check_one_account(account)
  connection = account.connect

  account.instances.each do |instance|
    # optimization; right now we ignore instances that are in the STOPPED, NEW, or CREATE_FAILED states.
    # when we get to stateful instances, this will need to change
    unless [Instance::STATE_NEW, Instance::STATE_STOPPED, Instance::STATE_CREATE_FAILED].include?(instance.state)
      api_instance = fetch_api_instance(connection, instance)

      if api_instance
        DBomaticLogger.instance.info("updating instance state for #{instance.name}: #{instance.external_key}")
        instance.state = Taskomatic.dcloud_to_instance_state(api_instance.state)

        # only update the public and private addresses if they are not nil.
        # this prevents us from deleting known information about instances
        if (addresses = extract_addresses(api_instance.public_addresses)).present?
          instance.public_addresses = addresses.join(',')
        end
        if (addresses = extract_addresses(api_instance.private_addresses)).present?
          instance.private_addresses = addresses.join(',')
        end
        # Only update the instance / create an event if anything has changed!
        instance.save! if instance.changed?
      else
        # We have an instance in our database, but it didn't come back over the API
        DBomaticLogger.instance.info("known instance missing from provider: #{instance.name} #{instance.external_key}")
        instance.update_attribute(:state, Instance::STATE_VANISHED)
      end
    end

    # For RHEV, we need to start up the instance after the vm has been created
    # and state changes from PENDING to STOPPED
    if instance.state == Instance::STATE_STOPPED and instance.total_state_time(Instance::STATE_RUNNING) == 0
      # Guarded like the lookup above: a provider error for this one instance
      # is logged and the start is skipped, rather than aborting the refresh
      # of every remaining instance of the account.
      api_instance = fetch_api_instance(connection, instance)
      if api_instance
        DBomaticLogger.instance.info("starting instance #{instance.name}: #{instance.external_key}")
        # TODO: do we need to set an actual user here instead of nil?
        task = instance.queue_action(nil, 'start')
        unless task
          raise ActionError.new("start cannot be performed on this instance.")
        end
        Taskomatic.start_instance(task)
        instance.state = Taskomatic.dcloud_to_instance_state(api_instance.state)
        instance.save!
      end
    end
  end
end

# Runs the given block once per element of +list+, each in its own forked
# child process, and waits until every child has finished.  Children still
# running after $deltacloud_timeout seconds are killed with signal 9.
#
# A child never returns from this method: whether the block completes or
# raises, the child terminates via Kernel.exit!.
def run_parallel(list)
  # the idea here is that we fork off one process for each provider account.
  # this is so that one slow or non-responsive deltacloud doesn't hold up the
  # status for all of them
  pids = []

  # sharing a single database connection amongst multiple processes causes
  # havoc with ActiveRecord.  Instead, close down the connection in the parent
  # before forking, and then have each child re-establish their own connection.
  # We also have to have the parent re-establish the connection so that
  # "collect_accounts" works properly
  ActiveRecord::Base.logger = DBomaticLogger.instance
  ActiveRecord::Base.remove_connection
  list.each do |item|
    pid = Process.fork
    if pid.nil?
      # child
      begin
        ActiveRecord::Base.establish_connection
        yield item
      rescue SystemExit
        # deliberate exit requested from inside the block; nothing to report
      rescue Exception => e
        # Report the failure here.  Without this handler the exception would
        # unwind into the caller's code *inside the child*, which would then
        # keep running as a second copy of the parent.
        first_frame, *remaining_frames = e.backtrace
        DBomaticLogger.instance.error "#{first_frame}: #{e.message}"
        remaining_frames.each do |step|
          DBomaticLogger.instance.error "\tfrom #{step}"
        end
      ensure
        # always leave the child here, even when the block (or the logging
        # above) raised
        Kernel.exit!
      end
    else
      # parent
      pids << pid
    end
  end
  ActiveRecord::Base.establish_connection

  # only the parent gets here, as all children exit above
  start = Time.now.to_i
  until pids.empty?
    # non-blocking reap of any finished child; nil means none is ready yet
    reaped = Process.wait(-1, Process::WNOHANG)
    if reaped
      pids.delete(reaped)
      next
    end

    # before sleeping, see if we have exceeded the timeout.  If so, kill off
    # all remaining children and just hope for the best next time around
    if (Time.now.to_i - start) > $deltacloud_timeout
      DBomaticLogger.instance.warn "Connection timeout of #{$deltacloud_timeout} seconds exceeded, backend clouds could not be contacted.  Will try again in #{$dbomatic_timeout} seconds"
      pids.each { |child| Process.kill(9, child) }
    end

    sleep 1
  end
end

# Checks every provider account returned by collect_accounts via
# run_parallel, logging when the refresh starts and when it completes.
def refresh_instances
  log = DBomaticLogger.instance
  log.info("Deltacloud instances refresh started")
  run_parallel(collect_accounts) { |provider_account| check_one_account(provider_account) }
  log.info("Deltacloud instances refresh completed")
end

# Calls populate_realms on every Provider via run_parallel, logging when the
# refresh starts and when it completes.
def refresh_realms
  log = DBomaticLogger.instance
  log.info("Deltacloud realms refresh started")
  run_parallel(Provider.all) { |cloud_provider| cloud_provider.populate_realms }
  log.info("Deltacloud realms refresh completed")
end

begin
  # load in the rails models; we delay it until here because the
  # daemonizing forks above wreak havoc with the database connection
  require File.expand_path(File.dirname(__FILE__) + '/../config/environment')
rescue Exception => e
  # Log the failure, then re-raise it.  The backtrace is destructured rather
  # than shift-ed so the re-raised exception keeps all of its frames.
  first_frame, *remaining_frames = e.backtrace
  logger.error "#{first_frame}: #{e.message}"
  remaining_frames.each do |step|
    logger.error "\tfrom #{step}"
  end
  raise
end

# Epoch seconds of the last realms refresh; 0 forces one on the first pass.
last_realms_refresh = 0

# Main loop: refresh instances on every pass, refresh realms when more than
# $realms_timeout seconds have elapsed since the previous realms refresh,
# then sleep for $dbomatic_timeout seconds.
logger.info "Beginning main event loop"
while true
  logger.info "Deltacloud refresh started"
  begin
    refresh_instances
    if (Time.now.to_i - last_realms_refresh) > $realms_timeout
      last_realms_refresh = Time.now.to_i
      refresh_realms
    end
  rescue SystemExit, SignalException
    # A stop request (e.g. the TERM trap calling exit) or an interrupt must
    # end the process, not be logged and swallowed by the handler below.
    raise
  rescue Exception => e
    # anything else is logged and retried on the next pass
    first_frame, *remaining_frames = e.backtrace
    logger.error "#{first_frame}: #{e.message}"
    remaining_frames.each do |step|
      logger.error "\tfrom #{step}"
    end
  end
  logger.info "Deltacloud refresh completed"
  sleep $dbomatic_timeout
end
