/* $Source: bitbucket.org:berkeleylab/gasnet.git/other/kinds/gasnet_cuda_uva.c $ * Description: GASNet Memory Kinds Implementation for CUDA UVA devices * Copyright (c) 2020, The Regents of the University of California * Terms of use are as specified in license.txt */ #define GASNETI_NEED_GASNET_MK_H 1 #include #include #if GASNET_HAVE_MK_CLASS_CUDA_UVA // Else empty #include GASNETI_IDENT(gasneti_IdentString_MKClassCUDAUVA, "$GASNetMKClassCUDAUVA: 1 $"); // // Class-specific MK type and functions // typedef struct my_MK_s { GASNETI_MK_COMMON // Class-indep prefix CUcontext ctx; CUdevice dev; int use_sync_memops; } *my_MK_t; // Wrapper and format for use of cuGetErrorName() const char *_gasneti_cuerror_name(CUresult res) { static const char *unknown = "UNKNOWN"; const char *errorname; if (cuGetErrorName(res, &errorname)) errorname = unknown; return errorname; } #define GASNETI_CURESULT_FMT "%s(%d)" #define GASNETI_CURESULT_STRING(res) _gasneti_cuerror_name(res),(res) // // Error checking/reporting wrapper // #define gasneti_check_cudacall(op) do { \ CUresult _retval = (op); \ if_pf (_retval) { \ gasneti_fatalerror("%s returned "GASNETI_CURESULT_FMT,#op,GASNETI_CURESULT_STRING(_retval));\ } \ } while (0) static const char *gasneti_formatmk_cuda_uva(gasneti_MK_t i_mk) { my_MK_t kind = (my_MK_t) i_mk; return gasneti_dynsprintf("CUDA_UVA(gex_CUdevice=%d)", (int)kind->dev); } static void gasneti_MK_Destroy_cuda_uva( gasneti_MK_t i_mk, gex_Flags_t flags) { my_MK_t mk = (my_MK_t) i_mk; gasneti_check_cudacall(cuDevicePrimaryCtxRelease(mk->dev)); gasneti_free_mk(i_mk); } static int gasneti_MK_Segment_Create_cuda_uva( gasneti_Segment_t *i_segment_p, gasneti_MK_t i_mk, void * addr, uintptr_t size, gex_Flags_t flags) { my_MK_t kind = (my_MK_t) i_mk; CUdeviceptr dptr; CUresult result; void * to_free = NULL; int retval = GASNET_OK; gasneti_check_cudacall(cuCtxPushCurrent(kind->ctx)); // TODO: // Might want additional care with respect to error returns from the CUDA device API. // In particular, any call "may also return error codes from previous, asynchronous launches." // Presently, we try to always provide the specific CUDA error code as we fatalerror. if (addr) { // Client-allocated dptr = (CUdeviceptr)addr; // cuPointerGetAttributes available since CUDA 7.0 unsigned int mem_type = 0; unsigned int is_managed = 0; CUcontext ctx = NULL; void * ptrs[3] = { (void*)&mem_type, (void*)&is_managed, (void*)&ctx }; CUpointer_attribute attrs[3] = { CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_CONTEXT }; result = cuPointerGetAttributes(3, attrs, ptrs, dptr); if (result) { gasneti_fatalerror("Failed to query pointer attributes of client-allocated memory: " GASNETI_CURESULT_FMT, GASNETI_CURESULT_STRING(result)); } if (mem_type != CU_MEMORYTYPE_DEVICE) { gasneti_fatalerror("Invalid call to gex_Segment_Create(CUDA_UVA) with non-device memory"); } if (is_managed) { gasneti_fatalerror("Invalid call to gex_Segment_Create(CUDA_UVA) with managed memory"); } // We currently accept memory allocated by *any* context for the same device. // TODO: should we be more strict by checking equality of contexts instead of devices? CUdevice dev; if ((result = cuCtxPushCurrent(ctx)) || (result = cuCtxGetDevice(&dev)) || (result = cuCtxPopCurrent(&ctx))) { gasneti_fatalerror("Failed to query CUDA device of client-allocated memory: " GASNETI_CURESULT_FMT, GASNETI_CURESULT_STRING(result)); } else if (dev != kind->dev) { gasneti_fatalerror("gex_Segment_Create(CUDA_UVA) with memory associated with wrong device"); } } else { // GASNet-allocated result = cuMemAlloc(&dptr, size); if (result == CUDA_ERROR_OUT_OF_MEMORY) { retval = GASNET_ERR_RESOURCE; goto out; } else if (result != CUDA_SUCCESS) { gasneti_fatalerror("cuMemAlloc() returned unexpected failure: " GASNETI_CURESULT_FMT, GASNETI_CURESULT_STRING(result)); } addr = to_free = (void *) dptr; } if (kind->use_sync_memops) { int one = 1; gasneti_check_cudacall(cuPointerSetAttribute(&one, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr)); } gasneti_Client_t client = i_mk->_client; gex_MK_t e_mk = gasneti_export_mk(i_mk); gasneti_Segment_t i_segment = gasneti_alloc_segment(client, addr, size, e_mk, !to_free, flags); i_segment->_opaque_mk_use = to_free; *i_segment_p = i_segment; out: { CUcontext prev_ctx; gasneti_check_cudacall(cuCtxPopCurrent(&prev_ctx)); gasneti_assert(prev_ctx == kind->ctx); } return retval; } static void gasneti_MK_Segment_Destroy_cuda_uva( gasneti_Segment_t i_segment) { CUdeviceptr to_free = (CUdeviceptr)(i_segment->_opaque_mk_use); if (to_free) { my_MK_t kind = (my_MK_t) gasneti_import_mk_nonhost(i_segment->_kind); CUcontext prev_ctx; gasneti_check_cudacall( cuCtxPushCurrent(kind->ctx) ); gasneti_check_cudacall( cuMemFree(to_free) ); gasneti_check_cudacall( cuCtxPopCurrent(&prev_ctx) ); gasneti_assert(prev_ctx == kind->ctx); } } // // Class-specific "impl(ementation)": constants and function pointers. // // Due to lack of designated initializers in GASNet's required C99 subset, we // address the fragility as the structure grows or changes by lazy explicit // initialization. static gasneti_mk_impl_t *get_impl(void) { // Static storage duration ensures these are zero-initialized static gasneti_mk_impl_t the_impl; static gasneti_mk_impl_t *result; if (!result) { static gasneti_mutex_t lock = GASNETI_MUTEX_INITIALIZER; gasneti_mutex_lock(&lock); if (!result) { the_impl.mk_class = GEX_MK_CLASS_CUDA_UVA; the_impl.mk_name = "CUDA_UVA"; the_impl.mk_sizeof = sizeof(struct my_MK_s); the_impl.mk_format = &gasneti_formatmk_cuda_uva; the_impl.mk_destroy = &gasneti_MK_Destroy_cuda_uva; the_impl.mk_segment_create = &gasneti_MK_Segment_Create_cuda_uva; the_impl.mk_segment_destroy = &gasneti_MK_Segment_Destroy_cuda_uva; gasneti_sync_writes(); result = &the_impl; } gasneti_mutex_unlock(&lock); } else { gasneti_sync_reads(); } gasneti_assert(result); return result; } // Class-specific create int gasneti_MK_Create_cuda_uva( gasneti_MK_t *i_memkind_p, gasneti_Client_t client, const gex_MK_Create_args_t *args, gex_Flags_t flags) { CUdevice dev = args->gex_args.gex_class_cuda_uva.gex_CUdevice; GASNETI_TRACE_PRINTF(O,("gex_MK_Create: class=CUDA_UVA gex_CUdevice=%d", dev)); if (dev < 0) { // This is always treated as programmer error gasneti_fatalerror("gex_MK_Create called with negative CUdevice=%i", dev); } // Obtain the primary context for the given device, initializing if needed CUcontext ctx; CUresult res = cuDevicePrimaryCtxRetain(&ctx, dev); if (res == CUDA_ERROR_NOT_INITIALIZED) { int initRes = cuInit(0); if (initRes == CUDA_SUCCESS) { res = cuDevicePrimaryCtxRetain(&ctx, dev); } else if (initRes == CUDA_ERROR_NO_DEVICE) { GASNETI_RETURN_ERRR(BAD_ARG,"GEX_MK_CLASS_CUDA_UVA: no CUDA devices found"); } else { const char *errorname; if (cuGetErrorName(initRes, &errorname)) errorname = "UNKNOWN"; const char *msg = gasneti_dynsprintf("GEX_MK_CLASS_CUDA_UVA: cuInit() returned %s(%i)", errorname, initRes); GASNETI_RETURN_ERRR(BAD_ARG,msg); } } // Failed to obtain the primary context, try to reason out why // TODO: explicit diagnosis of more failure cases if_pf (res != CUDA_SUCCESS) { const char *why = "unknown failure"; if (res == CUDA_ERROR_INVALID_DEVICE) { int dev_count; if (cuDeviceGetCount(&dev_count)) { why = "cuDeviceGetCount() failed"; } else if (! dev_count) { why = "no CUDA devices found"; } else { why = gasneti_dynsprintf("invalid CUdevice=%i (%d devices found)", dev, dev_count); } } else { const char *errorname; if (cuGetErrorName(res, &errorname)) errorname = "UNKNOWN"; why = gasneti_dynsprintf("cuDevicePrimaryCtxRetain() returned %s(%i)", errorname ,res); } const char *msg = gasneti_dynsprintf("GEX_MK_CLASS_CUDA_UVA: %s", why); GASNETI_RETURN_ERRR(BAD_ARG,msg); } int isUVA; if (cuDeviceGetAttribute(&isUVA, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev)) { GASNETI_RETURN_ERRR(BAD_ARG,"GEX_MK_CLASS_CUDA_UVA: failed to query CUDA device for UVA support"); } if (!isUVA) { GASNETI_RETURN_ERRR(BAD_ARG,"GEX_MK_CLASS_CUDA_UVA: passed context for a non-UVA device"); } my_MK_t result = (my_MK_t) gasneti_alloc_mk(client, get_impl(), flags); result->dev = dev; result->ctx = ctx; // TODO: could be a per-device setting? // TODO: is '1' the best default? result->use_sync_memops = gasneti_getenv_yesno_withdefault("GASNET_USE_CUDA_SYNC_MEMOPS", 1); *i_memkind_p = (gasneti_MK_t) result; return GASNET_OK; } #endif