/**
 * \brief Selects the CUDA device to be used.
 * \details Queries the device count to make sure that the CUDA runtime
 *  is usable, prints the driver version, then selects device 0.
 *  The code that ranks all the detected GPUs by estimated compute
 *  performance (highly inspired by the helpers library in CUDA
 *  examples) is still present after the early return, but it is
 *  intentionally unreachable (see the comment in the function body).
 * \return 0 (the ID of the selected device) on success, -1 if the
 *  device count could not be queried or if no device was detected.
 */
static int getBestDeviceID(void) {
    int current_device     = 0, sm_per_multiproc  = 0;
    int max_compute_perf   = 0, max_perf_device   = 0;
    int device_count       = 0, best_SM_arch      = 0;
    int compute_perf       = 0;
    int driver_ver         = 0;
    int runtime_ver        = 0;
    struct cudaDeviceProp deviceProp;
    int retval = CUDA()->cudaGetDeviceCount(&device_count);

    /* 35 is cudaErrorInsufficientDriver: the installed driver is older
     * than what the CUDA runtime requires. Print both versions to help
     * diagnosing the problem.
     */
    if(retval == 35) {
        nl_printf("Error: Driver/CUDA versions mismatch\n");
        retval = CUDA()->cudaDriverGetVersion(&driver_ver);
        nl_printf("cudaDriverGetVersion()   retval=%d\n",retval);
        retval = CUDA()->cudaRuntimeGetVersion(&runtime_ver);
        nl_printf("cudaRuntimeGetVersion()  retval=%d\n",retval);

        nl_printf("  Driver  version=%d\n",driver_ver);
        nl_printf("  Runtime version=%d\n",runtime_ver);

        return -1;
    }

    /* Any other failure (0 is cudaSuccess) means that there is no
     * usable device either, so do not pretend that device 0 exists.
     */
    if(retval != 0) {
        nl_printf("Error: cudaGetDeviceCount() failed, retval=%d\n",retval);
        return -1;
    }

    if(device_count <= 0) {
        nl_printf("Error: no CUDA device detected\n");
        return -1;
    }

    /* Best effort: the version is only displayed, so the return
     * value of the query is not checked (driver_ver stays 0 if the
     * call does not set it).
     */
    CUDA()->cudaDriverGetVersion(&driver_ver);
    nl_printf("OpenNL CUDA: Driver version=%d\n",driver_ver);

    /* The device ranking below is no longer used, because the
     * deviceProp structure is too much dependent on the CUDA version,
     * and they keep adding fields *at the beginning* of the
     * structure ... so let us take device 0 instead, will be OK
     * in most cases.
     */
    nl_printf("Using device 0\n");
    return 0;

    /* ------------------------------------------------------------
     * Everything below is intentionally unreachable (see above).
     * It is kept so that it can be re-enabled if needed.
     * ------------------------------------------------------------ */

    /* Find the best major SM Architecture GPU device */
    while (current_device < device_count) {
        CUDA()->cudaGetDeviceProperties(&deviceProp, current_device);
        /* If this GPU is not running on Compute Mode prohibited,
           then we can add it to the list */
        if (deviceProp.computeMode != cudaComputeModeProhibited) {
            if (deviceProp.major > 0 && deviceProp.major < 9999) {
                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
            }
        }
        current_device++;
    }

    /* Find the best CUDA capable GPU device */
    current_device = 0;
    while (current_device < device_count) {
        CUDA()->cudaGetDeviceProperties(&deviceProp, current_device);
        /* If this GPU is not running on Compute Mode prohibited,
           then we can add it to the list */
        if (deviceProp.computeMode != cudaComputeModeProhibited) {
            if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
                sm_per_multiproc = 1;
            } else {
                sm_per_multiproc = ConvertSMVer2Cores(
                    deviceProp.major, deviceProp.minor
                );
            }
            /* Performance estimate used to compare devices */
            compute_perf  =
                deviceProp.multiProcessorCount *
                sm_per_multiproc * deviceProp.clockRate;
            if (compute_perf  > max_compute_perf) {
                /* If we find GPU with SM major > 2, search only these */
                if (best_SM_arch > 2) {
                    /* If our device==best_SM_arch, choose this, or else pass */
                    if (deviceProp.major == best_SM_arch) {
                        max_compute_perf  = compute_perf;
                        max_perf_device   = current_device;
                    }
                } else {
                    max_compute_perf  = compute_perf;
                    max_perf_device   = current_device;
                }
            }
        }
        ++current_device;
    }

    /**
     * Wrong ! It says 1TFlops for GTX 1080, whereas the specs say 8TFlops
     nl_printf(
        "OpenNL CUDA: maximum device single-precision Gflops=%f\n",
        (double)(2*max_compute_perf)/(double)(1e6)
     );
    */

    return max_perf_device;
}

