pfsp_gpu_chpl.chpl

/*
  Single-GPU B&B to solve Taillard instances of the PFSP in Chapel.
*/

use Time;
use GpuDiagnostics;

use util;
use Pool;
use PFSP_node;
use Bound_johnson;
use Bound_simple;
use Taillard;

const allowedLowerBounds = ["lb1", "lb1_d", "lb2"];

config const BLOCK_SIZE = 512;

/*******************************************************************************
Implementation of the single-GPU PFSP search.
*******************************************************************************/

config const m = 25;
config const M = 50000;

config const inst: int = 14; // instance
config const lb: string = "lb1"; // lower bound function
config const ub: int = 1; // initial upper bound
/*
  NOTE: Only forward branching is considered because other strategies increase a
  lot the implementation complexity and do not add much contribution.
*/

const jobs = taillard_get_nb_jobs(inst);
const machines = taillard_get_nb_machines(inst);

const initUB = if (ub == 1) then taillard_get_best_ub(inst) else max(int);

proc check_parameters()
{
  if ((m <= 0) || (M <= 0)) then
    halt("Error: m and M must be positive integers.\n");

  if (inst < 1 || inst > 120) then
    halt("Error: unsupported Taillard's instance");

  if (allowedLowerBounds.find(lb) == -1) then
    halt("Error - Unsupported lower bound");

  if (ub != 0 && ub != 1) then
    halt("Error: unsupported upper bound initialization");
}

proc print_settings(): void
{
  writeln("\n=================================================");
  writeln("Single-GPU Chapel\n");
  writeln("Resolution of PFSP Taillard's instance: ta", inst, " (m = ", machines, ", n = ", jobs, ")");
  if (ub == 0) then writeln("Initial upper bound: inf");
  else /* if (ub == 1) */ writeln("Initial upper bound: opt");
  writeln("Lower bound function: ", lb);
  writeln("Branching rule: fwd");
  writeln("=================================================");
}

proc print_results(const optimum: int, const exploredTree: uint, const exploredSol: uint,
  const timer: real)
{
  writeln("\n=================================================");
  writeln("Size of the explored tree: ", exploredTree);
  writeln("Number of explored solutions: ", exploredSol);
  const is_better = if (optimum < initUB) then " (improved)"
                                          else " (not improved)";
  writeln("Optimal makespan: ", optimum, is_better);
  writeln("Elapsed time: ", timer, " [s]");
  writeln("=================================================\n");
}

proc help_message(): void
{
  writeln("\n  PFSP Benchmark Parameters:\n");
  writeln("   --inst   int   Taillard's instance to solve (between 001 and 120)");
  writeln("   --lb     str   lower bound function (lb1, lb1_d, lb2)");
  writeln("   --ub     int   initial upper bound (0, 1)\n");
}

// Evaluate and generate children nodes on CPU.
proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
  ref best: int, ref pool)
{
  for i in parent.limit1+1..(jobs-1) {
    var child = new Node();
    child.depth = parent.depth + 1;
    child.limit1 = parent.limit1 + 1;
    child.prmu = parent.prmu;
    child.prmu[parent.depth] <=> child.prmu[i];

    var lowerbound = lb1_bound(lb1_data, child.prmu, child.limit1, jobs);

    if (child.depth == jobs) { // if child leaf
      num_sol += 1;

      if (lowerbound < best) { // if child feasible
        best = lowerbound;
      }
    } else { // if not leaf
      if (lowerbound < best) { // if child feasible
        pool.pushBack(child);
        tree_loc += 1;
      }
    }
  }
}

proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
  ref best: int, ref pool)
{
  var lb_begin: MAX_JOBS*int(32);

  lb1_children_bounds(lb1_data, parent.prmu, parent.limit1, jobs, lb_begin);

  for i in parent.limit1+1..(jobs-1) {
    const job = parent.prmu[i];
    const lowerbound = lb_begin[job];

    if (parent.depth + 1 == jobs) { // if child leaf
      num_sol += 1;

      if (lowerbound < best) { // if child feasible
        best = lowerbound;
      }
    } else { // if not leaf
      if (lowerbound < best) { // if child feasible
        var child = new Node();
        child.depth = parent.depth + 1;
        child.limit1 = parent.limit1 + 1;
        child.prmu = parent.prmu;
        child.prmu[parent.depth] <=> child.prmu[i];

        pool.pushBack(child);
        tree_loc += 1;
      }
    }
  }
}

proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
  ref num_sol: uint, ref best: int, ref pool)
{
  for i in parent.limit1+1..(jobs-1) {
    var child = new Node();
    child.depth = parent.depth + 1;
    child.limit1 = parent.limit1 + 1;
    child.prmu = parent.prmu;
    child.prmu[parent.depth] <=> child.prmu[i];

    var lowerbound = lb2_bound(lb1_data, lb2_data, child.prmu, child.limit1, jobs, best);

    if (child.depth == jobs) { // if child leaf
      num_sol += 1;

      if (lowerbound < best) { // if child feasible
        best = lowerbound;
      }
    } else { // if not leaf
      if (lowerbound < best) { // if child feasible
        pool.pushBack(child);
        tree_loc += 1;
      }
    }
  }
}

// Evaluate and generate children nodes on CPU.
proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
  ref num_sol: uint, ref best: int, ref pool)
{
  select lb {
    when "lb1_d" {
      decompose_lb1_d(lb1_data, parent, tree_loc, num_sol, best, pool);
    }
    when "lb1" {
      decompose_lb1(lb1_data, parent, tree_loc, num_sol, best, pool);
    }
    otherwise { // lb2
      decompose_lb2(lb1_data, lb2_data, parent, tree_loc, num_sol, best, pool);
    }
  }
}

// Evaluate a bulk of parent nodes on GPU using lb1.
proc evaluate_gpu_lb1(const parents_d: [] Node, const size, const lbound1_d, ref bounds_d)
{
  @assertOnGpu
  foreach threadId in 0..#size {
    const parentId = threadId / jobs;
    const k = threadId % jobs;
    var parent = parents_d[parentId];
    const depth = parent.depth;
    var prmu = parent.prmu;

    if (k >= parent.limit1+1) {
      prmu[depth] <=> prmu[k];
      bounds_d[threadId] = lb1_bound(lbound1_d, prmu, parent.limit1+1, jobs);
      prmu[depth] <=> prmu[k];
    }
  }
}

/*
  NOTE: This lower bound evaluates all the children of a given parent at the same time.
  Therefore, the GPU loop is on the parent nodes and not on the children ones, in contrast
  to the other lower bounds.
*/
// Evaluate a bulk of parent nodes on GPU using lb1_d.
proc evaluate_gpu_lb1_d(const parents_d: [] Node, const size, const best, const lbound1_d, ref bounds_d)
{
  @assertOnGpu
  foreach parentId in 0..#(size/jobs) {
    var parent = parents_d[parentId];
    /* const depth = parent.depth; */
    var prmu = parent.prmu;

    var lb_begin: MAX_JOBS*int(32);

    lb1_children_bounds(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);

    for k in 0..#jobs {
      if (k >= parent.limit1+1) {
        const job = parent.prmu[k];
        bounds_d[parentId*jobs+k] = lb_begin[job];
      }
    }
  }
}

// Evaluate a bulk of parent nodes on GPU using lb2.
proc evaluate_gpu_lb2(const parents_d: [] Node, const size, const best, const lbound1_d, const lbound2_d, ref bounds_d)
{
  @assertOnGpu
  foreach threadId in 0..#size {
    const parentId = threadId / jobs;
    const k = threadId % jobs;
    var parent = parents_d[parentId];
    const depth = parent.depth;
    var prmu = parent.prmu;

    if (k >= parent.limit1+1) {
      prmu[depth] <=> prmu[k];
      bounds_d[threadId] = lb2_bound(lbound1_d, lbound2_d, prmu, parent.limit1+1, jobs, best);
      prmu[depth] <=> prmu[k];
    }
  }
}

// Evaluate a bulk of parent nodes on GPU.
proc evaluate_gpu(const parents_d: [] Node, const size, const best, const lbound1_d, const lbound2_d, ref bounds_d)
{
  select lb {
    when "lb1_d" {
      evaluate_gpu_lb1_d(parents_d, size, best, lbound1_d, bounds_d);
    }
    when "lb1" {
      evaluate_gpu_lb1(parents_d, size, lbound1_d, bounds_d);
    }
    otherwise { // lb2
      evaluate_gpu_lb2(parents_d, size, best, lbound1_d, lbound2_d, bounds_d);
    }
  }
}

// Generate children nodes (evaluated by GPU) on CPU.
proc generate_children(const ref parents: [] Node, const size: int, const ref bounds: [] int(32),
  ref exploredTree: uint, ref exploredSol: uint, ref best: int, ref pool: SinglePool(Node))
{
  for i in 0..#size {
    const parent = parents[i];
    const depth = parent.depth;

    for j in parent.limit1+1..(jobs-1) {
      const lowerbound = bounds[j + i * jobs];

      if (depth + 1 == jobs) { // if child leaf
        exploredSol += 1;

        if (lowerbound < best) { // if child feasible
          best = lowerbound;
        }
      } else { // if not leaf
        if (lowerbound < best) { // if child feasible
          var child = new Node();
          child.depth = depth + 1;
          child.limit1 = parent.limit1 + 1;
          child.prmu = parent.prmu;
          child.prmu[depth] <=> child.prmu[j];

          pool.pushBack(child);
          exploredTree += 1;
        }
      }
    }
  }
}

// Single-GPU PFSP search.
proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint, ref elapsedTime: real)
{
  const device = here.gpus[0];

  var best: int = initUB;

  var root = new Node(jobs);

  var pool = new SinglePool(Node);
  pool.pushBack(root);

  var timer: stopwatch;

  /*
    Step 1: We perform a partial breadth-first search on CPU in order to create
    a sufficiently large amount of work for GPU computation.
  */
  timer.start();

  var lbound1 = new lb1_bound_data(jobs, machines);
  taillard_get_processing_times(lbound1.p_times, inst);
  fill_min_heads_tails(lbound1);

  var lbound2 = new lb2_bound_data(jobs, machines);
  fill_machine_pairs(lbound2/*, LB2_FULL*/);
  fill_lags(lbound1.p_times, lbound2);
  fill_johnson_schedules(lbound1.p_times, lbound2);

  while (pool.size < m) {
    var hasWork = 0;
    var parent = pool.popFront(hasWork);
    if !hasWork then break;

    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
  }

  timer.stop();
  const res1 = (timer.elapsed(), exploredTree, exploredSol);

  writeln("\nInitial search on CPU completed");
  writeln("Size of the explored tree: ", res1[1]);
  writeln("Number of explored solutions: ", res1[2]);
  writeln("Elapsed time: ", res1[0], " [s]\n");

  /*
    Step 2: We continue the search on GPU in a depth-first manner until there
    is not enough work.
  */
  timer.start();

  var parents: [0..#M] Node = noinit;
  var bounds: [0..#(M*jobs)] int(32) = noinit;

  on device var parents_d: [0..#M] Node;
  on device var bounds_d: [0..#(M*jobs)] int(32);

  on device var lbound1_d = new lb1_bound_data(jobs, machines);
  lbound1_d.p_times   = lbound1.p_times;
  lbound1_d.min_heads = lbound1.min_heads;
  lbound1_d.min_tails = lbound1.min_tails;

  on device var lbound2_d = new lb2_bound_data(jobs, machines);
  lbound2_d.johnson_schedules  = lbound2.johnson_schedules;
  lbound2_d.lags               = lbound2.lags;
  lbound2_d.machine_pairs      = lbound2.machine_pairs;
  lbound2_d.machine_pair_order = lbound2.machine_pair_order;

  while true {
    var poolSize = pool.popBackBulk(m, M, parents);

    if (poolSize > 0) {
      /*
        TODO: Optimize 'numBounds' based on the fact that the maximum number of
        generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
        something like that.
      */
      const numBounds = jobs * poolSize;

      parents_d = parents; // host-to-device
      on device do evaluate_gpu(parents_d, numBounds, best, lbound1_d, lbound2_d, bounds_d); // GPU kernel
      bounds = bounds_d; // device-to-host

      /*
        Each task generates and inserts its children nodes to the pool.
      */
      generate_children(parents, poolSize, bounds, exploredTree, exploredSol, best, pool);
    }
    else {
      break;
    }
  }

  timer.stop();
  const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;

  writeln("Search on GPU completed");
  writeln("Size of the explored tree: ", res2[1]);
  writeln("Number of explored solutions: ", res2[2]);
  writeln("Elapsed time: ", res2[0], " [s]\n");

  /*
    Step 3: We complete the depth-first search on CPU.
  */
  timer.start();

  while true {
    var hasWork = 0;
    var parent = pool.popBack(hasWork);
    if !hasWork then break;

    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
  }

  timer.stop();
  elapsedTime = timer.elapsed();
  const res3 = (elapsedTime, exploredTree, exploredSol) - res1 - res2;

  writeln("Search on CPU completed");
  writeln("Size of the explored tree: ", res3[1]);
  writeln("Number of explored solutions: ", res3[2]);
  writeln("Elapsed time: ", res3[0], " [s]");

  optimum = best;

  writeln("\nExploration terminated.");
}

proc main(args: [] string)
{
  // Helper
  for a in args[1..] {
    if (a == "-h" || a == "--help") {
      common_help_message();
      help_message();

      return 1;
    }
  }

  check_parameters();
  print_settings();

  var optimum: int;
  var exploredTree: uint = 0;
  var exploredSol: uint = 0;

  var elapsedTime: real;

  startGpuDiagnostics();

  pfsp_search(optimum, exploredTree, exploredSol, elapsedTime);

  stopGpuDiagnostics();

  print_results(optimum, exploredTree, exploredSol, elapsedTime);

  writeln("GPU diagnostics:");
  writeln("   kernel_launch: ", getGpuDiagnostics().kernel_launch);
  writeln("   host_to_device: ", getGpuDiagnostics().host_to_device);
  writeln("   device_to_host: ", getGpuDiagnostics().device_to_host);
  writeln("   device_to_device: ", getGpuDiagnostics().device_to_device);

  return 0;
}