gpu host code generation
@@ -654,6 +654,7 @@ pub(crate) fn run_pass_manager(
 // We then run the llvm_optimize function a second time, to optimize the code which we generated
 // in the enzyme differentiation pass.
 let enable_ad = config.autodiff.contains(&config::AutoDiff::Enable);
+let enable_gpu = config.offload.contains(&config::Offload::Enable);
 let stage = if thin {
     write::AutodiffStage::PreAD
 } else {
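(Context note: the single added line above gates the new offload path. `config.offload` is presumably populated from an unstable `-Zoffload=Enable` flag, by analogy with `config.autodiff` and `-Zautodiff=Enable`; the exact flag spelling is an assumption here, not something this hunk shows.)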
compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs (new file, +439 lines)
@@ -0,0 +1,439 @@
use std::ffi::CString;

use llvm::Linkage::*;
use rustc_abi::Align;
use rustc_codegen_ssa::back::write::CodegenContext;
use rustc_codegen_ssa::traits::BaseTypeCodegenMethods;

use crate::builder::SBuilder;
use crate::common::AsCCharPtr;
use crate::llvm::AttributePlace::Function;
use crate::llvm::{self, Linkage, Type, Value};
use crate::{LlvmCodegenBackend, SimpleCx, attributes};

pub(crate) fn handle_gpu_code<'ll>(
    _cgcx: &CodegenContext<LlvmCodegenBackend>,
    cx: &'ll SimpleCx<'_>,
) {
    // The offload memory transfer type for each kernel
    let mut o_types = vec![];
    let mut kernels = vec![];
    let offload_entry_ty = add_tgt_offload_entry(&cx);
    for num in 0..9 {
        let kernel = cx.get_function(&format!("kernel_{num}"));
        if let Some(kernel) = kernel {
            o_types.push(gen_define_handling(&cx, kernel, offload_entry_ty, num));
            kernels.push(kernel);
        }
    }

    gen_call_handling(&cx, &kernels, &o_types);
}
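Note that this pass only recognizes functions whose LLVM symbol names are exactly `kernel_0` through `kernel_8`. A minimal sketch of user code that would be picked up (hypothetical; the real offload frontend that assigns these names is still to be written):

    // Hypothetical user code: `#[no_mangle]` keeps the symbol name `kernel_1`,
    // so `cx.get_function("kernel_1")` can find it in the LLVM module.
    #[no_mangle]
    pub extern "C" fn kernel_1(x: &mut [f32; 256]) {
        for v in x.iter_mut() {
            *v += 1.0;
        }
    }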

// What is our @1 here? A magic global, used in our data_{begin/update/end}_mapper:
// @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
// @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
fn generate_at_one<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Value {
    // @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
    let unknown_txt = ";unknown;unknown;0;0;;";
    let c_entry_name = CString::new(unknown_txt).unwrap();
    let c_val = c_entry_name.as_bytes_with_nul();
    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
    let at_zero = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
    llvm::set_alignment(at_zero, Align::ONE);

    // @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8
    let struct_ident_ty = cx.type_named_struct("struct.ident_t");
    let struct_elems = vec![
        cx.get_const_i32(0),
        cx.get_const_i32(2),
        cx.get_const_i32(0),
        cx.get_const_i32(22),
        at_zero,
    ];
    let struct_elems_ty: Vec<_> = struct_elems.iter().map(|&x| cx.val_ty(x)).collect();
    let initializer = crate::common::named_struct(struct_ident_ty, &struct_elems);
    cx.set_struct_body(struct_ident_ty, &struct_elems_ty, false);
    let at_one = add_unnamed_global(&cx, &"", initializer, PrivateLinkage);
    llvm::set_alignment(at_one, Align::EIGHT);
    at_one
}

pub(crate) fn add_tgt_offload_entry<'ll>(cx: &'ll SimpleCx<'_>) -> &'ll llvm::Type {
    let offload_entry_ty = cx.type_named_struct("struct.__tgt_offload_entry");
    let tptr = cx.type_ptr();
    let ti64 = cx.type_i64();
    let ti32 = cx.type_i32();
    let ti16 = cx.type_i16();
    // For each kernel to run on the gpu, we will later generate one entry of this type.
    // Copied from LLVM:
    // typedef struct {
    //   uint64_t Reserved;
    //   uint16_t Version;
    //   uint16_t Kind;
    //   uint32_t Flags;    // Flags associated with the entry (see Target Region Entry Flags)
    //   void *Address;     // Address of global symbol within device image (function or global)
    //   char *SymbolName;
    //   uint64_t Size;     // Size of the entry info (0 if it is a function)
    //   uint64_t Data;
    //   void *AuxAddr;
    // } __tgt_offload_entry;
    let entry_elements = vec![ti64, ti16, ti16, ti32, tptr, tptr, ti64, ti64, tptr];
    cx.set_struct_body(offload_entry_ty, &entry_elements, false);
    offload_entry_ty
}
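If lowered as written, the struct body set here should print as the following LLVM type (a sketch of the expected IR, derived from the element list above rather than copied from test output):

    %struct.__tgt_offload_entry = type { i64, i16, i16, i32, ptr, ptr, i64, i64, ptr }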

fn gen_tgt_kernel_global<'ll>(cx: &'ll SimpleCx<'_>) {
    let kernel_arguments_ty = cx.type_named_struct("struct.__tgt_kernel_arguments");
    let tptr = cx.type_ptr();
    let ti64 = cx.type_i64();
    let ti32 = cx.type_i32();
    let tarr = cx.type_array(ti32, 3);

    // Taken from the LLVM APITypes.h declaration:
    // struct KernelArgsTy {
    //   uint32_t Version = 0;         // Version of this struct for ABI compatibility.
    //   uint32_t NumArgs = 0;         // Number of arguments in each input pointer.
    //   void **ArgBasePtrs = nullptr; // Base pointer of each argument (e.g. a struct).
    //   void **ArgPtrs = nullptr;     // Pointer to the argument data.
    //   int64_t *ArgSizes = nullptr;  // Size of the argument data in bytes.
    //   int64_t *ArgTypes = nullptr;  // Type of the data (e.g. to / from).
    //   void **ArgNames = nullptr;    // Name of the data for debugging, possibly null.
    //   void **ArgMappers = nullptr;  // User-defined mappers, possibly null.
    //   uint64_t Tripcount = 0;       // Tripcount for the teams / distribute loop, 0 otherwise.
    //   struct {
    //     uint64_t NoWait : 1;        // Was this kernel spawned with a `nowait` clause.
    //     uint64_t IsCUDA : 1;        // Was this kernel spawned via CUDA.
    //     uint64_t Unused : 62;
    //   } Flags = {0, 0, 0};
    //   // The number of teams (for x,y,z dimension).
    //   uint32_t NumTeams[3] = {0, 0, 0};
    //   // The number of threads (for x,y,z dimension).
    //   uint32_t ThreadLimit[3] = {0, 0, 0};
    //   uint32_t DynCGroupMem = 0;    // Amount of dynamic cgroup memory requested.
    // };
    let kernel_elements =
        vec![ti32, ti32, tptr, tptr, tptr, tptr, tptr, tptr, ti64, ti64, tarr, tarr, ti32];

    cx.set_struct_body(kernel_arguments_ty, &kernel_elements, false);
    // We don't handle kernels yet, so for now we just add a global dummy
    // to make sure that the __tgt_offload_entry is defined and handled correctly.
    cx.declare_global("my_struct_global2", kernel_arguments_ty);
}
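Again as a sketch, the matching LLVM type for the element list above would be:

    %struct.__tgt_kernel_arguments = type { i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, i64, i64, [3 x i32], [3 x i32], i32 }

The two `i64` fields cover `Tripcount` and the packed `Flags` bitfield.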

fn gen_tgt_data_mappers<'ll>(
    cx: &'ll SimpleCx<'_>,
) -> (&'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Value, &'ll llvm::Type) {
    let tptr = cx.type_ptr();
    let ti64 = cx.type_i64();
    let ti32 = cx.type_i32();

    let args = vec![tptr, ti64, ti32, tptr, tptr, tptr, tptr, tptr, tptr];
    let mapper_fn_ty = cx.type_func(&args, cx.type_void());
    let mapper_begin = "__tgt_target_data_begin_mapper";
    let mapper_update = "__tgt_target_data_update_mapper";
    let mapper_end = "__tgt_target_data_end_mapper";
    let begin_mapper_decl = declare_offload_fn(&cx, mapper_begin, mapper_fn_ty);
    let update_mapper_decl = declare_offload_fn(&cx, mapper_update, mapper_fn_ty);
    let end_mapper_decl = declare_offload_fn(&cx, mapper_end, mapper_fn_ty);

    let nounwind = llvm::AttributeKind::NoUnwind.create_attr(cx.llcx);
    attributes::apply_to_llfn(begin_mapper_decl, Function, &[nounwind]);
    attributes::apply_to_llfn(update_mapper_decl, Function, &[nounwind]);
    attributes::apply_to_llfn(end_mapper_decl, Function, &[nounwind]);

    (begin_mapper_decl, update_mapper_decl, end_mapper_decl, mapper_fn_ty)
}
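These are declarations of the offload runtime's data-mapper entry points; in IR they should come out roughly as (sketch, attribute group numbering illustrative):

    declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #0
    declare void @__tgt_target_data_update_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #0
    declare void @__tgt_target_data_end_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr) #0

where `#0` carries the `nounwind` attribute applied above.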

fn add_priv_unnamed_arr<'ll>(cx: &SimpleCx<'ll>, name: &str, vals: &[u64]) -> &'ll llvm::Value {
    let ti64 = cx.type_i64();
    let mut size_val = Vec::with_capacity(vals.len());
    for &val in vals {
        size_val.push(cx.get_const_i64(val));
    }
    let initializer = cx.const_array(ti64, &size_val);
    add_unnamed_global(cx, name, initializer, PrivateLinkage)
}

pub(crate) fn add_unnamed_global<'ll>(
    cx: &SimpleCx<'ll>,
    name: &str,
    initializer: &'ll llvm::Value,
    l: Linkage,
) -> &'ll llvm::Value {
    let llglobal = add_global(cx, name, initializer, l);
    llvm::LLVMSetUnnamedAddress(llglobal, llvm::UnnamedAddr::Global);
    llglobal
}

pub(crate) fn add_global<'ll>(
    cx: &SimpleCx<'ll>,
    name: &str,
    initializer: &'ll llvm::Value,
    l: Linkage,
) -> &'ll llvm::Value {
    let c_name = CString::new(name).unwrap();
    let llglobal: &'ll llvm::Value = llvm::add_global(cx.llmod, cx.val_ty(initializer), &c_name);
    llvm::set_global_constant(llglobal, true);
    llvm::set_linkage(llglobal, l);
    llvm::set_initializer(llglobal, initializer);
    llglobal
}
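As an example of what these helpers produce: for a kernel with a single pointer argument, `gen_define_handling` below would use them to emit globals along the lines of (sketch):

    @.offload_sizes.0 = private unnamed_addr constant [1 x i64] [i64 1024]
    @.offload_maptypes.0 = private unnamed_addr constant [1 x i64] [i64 3]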

fn gen_define_handling<'ll>(
    cx: &'ll SimpleCx<'_>,
    kernel: &'ll llvm::Value,
    offload_entry_ty: &'ll llvm::Type,
    num: i64,
) -> &'ll llvm::Value {
    let types = cx.func_params_types(cx.get_type_of_global(kernel));
    // It seems like non-pointer values are automatically mapped. So here, we focus on pointer (or
    // reference) types.
    let num_ptr_types = types
        .iter()
        .filter(|&x| matches!(cx.type_kind(x), rustc_codegen_ssa::common::TypeKind::Pointer))
        .count();

    // We no longer know their size at this level, so we hardcode a placeholder.
    // A follow-up PR will track these from the frontend, where we still have Rust types.
    // Then we will be able to figure out that e.g. `&[f32; 256]` results in 4*256 bytes.
    // For now, 1024 bytes serves as the placeholder value.
    add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{num}"), &vec![1024; num_ptr_types]);
    // Here we figure out whether something needs to be copied to the gpu (=1), from the gpu (=2),
    // or both to and from the gpu (=3). Other values shouldn't affect us for now.
    // A non-mutable reference or pointer will be 1; an array that's not read, but fully
    // overwritten, will be 2. For now, everything is 3, until we have our frontend set up.
    let o_types =
        add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{num}"), &vec![3; num_ptr_types]);
    // Next, for each kernel function we generate three entries: a weak constant (the region id),
    // the .llvm.rodata entry name, and the value in the omp_offloading_entries section.

    let name = format!(".kernel_{num}.region_id");
    let initializer = cx.get_const_i8(0);
    let region_id = add_unnamed_global(&cx, &name, initializer, WeakAnyLinkage);

    let c_entry_name = CString::new(format!("kernel_{num}")).unwrap();
    let c_val = c_entry_name.as_bytes_with_nul();
    let offload_entry_name = format!(".offloading.entry_name.{num}");

    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
    llvm::set_alignment(llglobal, Align::ONE);
    llvm::set_section(llglobal, c".llvm.rodata.offloading");

    // Not actively used yet; this will be needed for calling real kernels.
    let name = format!(".offloading.entry.kernel_{num}");

    // See the __tgt_offload_entry documentation above.
    let reserved = cx.get_const_i64(0);
    let version = cx.get_const_i16(1);
    let kind = cx.get_const_i16(1);
    let flags = cx.get_const_i32(0);
    let size = cx.get_const_i64(0);
    let data = cx.get_const_i64(0);
    let aux_addr = cx.const_null(cx.type_ptr());
    let elems = vec![reserved, version, kind, flags, region_id, llglobal, size, data, aux_addr];

    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
    let c_name = CString::new(name).unwrap();
    let llglobal = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
    llvm::set_global_constant(llglobal, true);
    llvm::set_linkage(llglobal, WeakAnyLinkage);
    llvm::set_initializer(llglobal, initializer);
    llvm::set_alignment(llglobal, Align::ONE);
    let c_section_name = CString::new(".omp_offloading_entries").unwrap();
    llvm::set_section(llglobal, &c_section_name);
    o_types
}
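Putting the pieces together, for `num = 0` the generated globals should look roughly like this in IR (sketch, derived from the constants above):

    @.kernel_0.region_id = weak unnamed_addr constant i8 0
    @.offloading.entry_name.0 = internal unnamed_addr constant [9 x i8] c"kernel_0\00", section ".llvm.rodata.offloading", align 1
    @.offloading.entry.kernel_0 = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 1, i32 0, ptr @.kernel_0.region_id, ptr @.offloading.entry_name.0, i64 0, i64 0, ptr null }, section ".omp_offloading_entries", align 1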

fn declare_offload_fn<'ll>(
    cx: &'ll SimpleCx<'_>,
    name: &str,
    ty: &'ll llvm::Type,
) -> &'ll llvm::Value {
    crate::declare::declare_simple_fn(
        cx,
        name,
        llvm::CallConv::CCallConv,
        llvm::UnnamedAddr::No,
        llvm::Visibility::Default,
        ty,
    )
}

// For each kernel *call*, we now use some of our previously declared globals to move data to and
// from the gpu. We don't have a proper frontend yet, so we assume that every call to a kernel
// function from main is intended to run on the GPU. For now, we only handle the data transfer part
// of it. If two consecutive kernels use the same memory, we still move it to the host and back to
// the gpu. Since in our frontend users (by default) don't have to specify data transfers, this is
// something we should optimize in the future! We also assume that everything should be copied back
// and forth, but sometimes we can directly zero-allocate on the device and only move back, or if
// something is immutable, we might only copy it to the device, but not back.
//
// Current steps:
// 0. Alloca some variables for the following steps.
// 1. Set the insert point before the kernel call.
// 2. Generate all the GEPs and stores, to be used in 3).
// 3. Generate __tgt_target_data_begin calls to move data to the GPU.
//
// unchanged: keep the kernel call. Later we will move the kernel itself to the GPU.
//
// 4. Set the insert point after the kernel call.
// 5. Generate all the GEPs and stores, to be used in 6).
// 6. Generate __tgt_target_data_end calls to move data from the GPU.
fn gen_call_handling<'ll>(
    cx: &'ll SimpleCx<'_>,
    _kernels: &[&'ll llvm::Value],
    o_types: &[&'ll llvm::Value],
) {
    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
    let tptr = cx.type_ptr();
    let ti32 = cx.type_i32();
    let tgt_bin_desc_ty = vec![ti32, tptr, tptr, tptr];
    let tgt_bin_desc = cx.type_named_struct("struct.__tgt_bin_desc");
    cx.set_struct_body(tgt_bin_desc, &tgt_bin_desc_ty, false);

    gen_tgt_kernel_global(&cx);
    let (begin_mapper_decl, _, end_mapper_decl, fn_ty) = gen_tgt_data_mappers(&cx);

    let main_fn = cx.get_function("main");
    let Some(main_fn) = main_fn else { return };
    let kernel_name = "kernel_1";
    let call = unsafe {
        llvm::LLVMRustGetFunctionCall(main_fn, kernel_name.as_c_char_ptr(), kernel_name.len())
    };
    let Some(kernel_call) = call else {
        return;
    };
    let kernel_call_bb = unsafe { llvm::LLVMGetInstructionParent(kernel_call) };
    let called = unsafe { llvm::LLVMGetCalledValue(kernel_call).unwrap() };
    let mut builder = SBuilder::build(cx, kernel_call_bb);

    let types = cx.func_params_types(cx.get_type_of_global(called));
    let num_args = types.len() as u64;

    // Step 0)
    // %struct.__tgt_bin_desc = type { i32, ptr, ptr, ptr }
    // %6 = alloca %struct.__tgt_bin_desc, align 8
    unsafe { llvm::LLVMRustPositionBuilderPastAllocas(builder.llbuilder, main_fn) };

    let tgt_bin_desc_alloca = builder.direct_alloca(tgt_bin_desc, Align::EIGHT, "EmptyDesc");

    let ty = cx.type_array(cx.type_ptr(), num_args);
    // Baseptrs are just the input pointers to the kernel, stored in a local alloca.
    let a1 = builder.direct_alloca(ty, Align::EIGHT, ".offload_baseptrs");
    // Ptrs are the result of a gep into the baseptr, at least for our trivial types.
    let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
    // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
    let ty2 = cx.type_array(cx.type_i64(), num_args);
    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
    // Now, once per function param, we allocate a copy to be passed to one of our mappers.
    let mut vals = vec![];
    let mut geps = vec![];
    let i32_0 = cx.get_const_i32(0);
    for (index, in_ty) in types.iter().enumerate() {
        // Get the function arg, store it into the alloca, and read it back.
        let p = llvm::get_param(called, index as u32);
        let name = llvm::get_value_name(p);
        let name = str::from_utf8(&name).unwrap();
        let arg_name = format!("{name}.addr");
        let alloca = builder.direct_alloca(in_ty, Align::EIGHT, &arg_name);

        builder.store(p, alloca, Align::EIGHT);
        let val = builder.load(in_ty, alloca, Align::EIGHT);
        let gep = builder.inbounds_gep(cx.type_f32(), val, &[i32_0]);
        vals.push(val);
        geps.push(gep);
    }

    // Step 1)
    unsafe { llvm::LLVMRustPositionBefore(builder.llbuilder, kernel_call) };
    builder.memset(tgt_bin_desc_alloca, cx.get_const_i8(0), cx.get_const_i64(32), Align::EIGHT);

    let mapper_fn_ty = cx.type_func(&[cx.type_ptr()], cx.type_void());
    let register_lib_decl = declare_offload_fn(&cx, "__tgt_register_lib", mapper_fn_ty);
    let unregister_lib_decl = declare_offload_fn(&cx, "__tgt_unregister_lib", mapper_fn_ty);
    let init_ty = cx.type_func(&[], cx.type_void());
    let init_rtls_decl = declare_offload_fn(cx, "__tgt_init_all_rtls", init_ty);

    // call void @__tgt_register_lib(ptr noundef %6)
    builder.call(mapper_fn_ty, register_lib_decl, &[tgt_bin_desc_alloca], None);
    // call void @__tgt_init_all_rtls()
    builder.call(init_ty, init_rtls_decl, &[], None);

    for i in 0..num_args {
        let idx = cx.get_const_i32(i);
        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, idx]);
        builder.store(vals[i as usize], gep1, Align::EIGHT);
        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
        builder.store(geps[i as usize], gep2, Align::EIGHT);
        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
        // As mentioned above, we don't use Rust type information yet. So for now we will just
        // assume that we have 1024 bytes, i.e. 256 f32 values.
        // FIXME(offload): write an offload frontend and handle arbitrary types.
        builder.store(cx.get_const_i64(1024), gep3, Align::EIGHT);
    }

    // For now we have a very simplistic indexing scheme into our
    // offload_{baseptrs,ptrs,sizes}. We will probably improve this along with our gpu frontend pr.
    fn get_geps<'a, 'll>(
        builder: &mut SBuilder<'a, 'll>,
        cx: &'ll SimpleCx<'ll>,
        ty: &'ll Type,
        ty2: &'ll Type,
        a1: &'ll Value,
        a2: &'ll Value,
        a4: &'ll Value,
    ) -> (&'ll Value, &'ll Value, &'ll Value) {
        let i32_0 = cx.get_const_i32(0);

        let gep1 = builder.inbounds_gep(ty, a1, &[i32_0, i32_0]);
        let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, i32_0]);
        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, i32_0]);
        (gep1, gep2, gep3)
    }

    fn generate_mapper_call<'a, 'll>(
        builder: &mut SBuilder<'a, 'll>,
        cx: &'ll SimpleCx<'ll>,
        geps: (&'ll Value, &'ll Value, &'ll Value),
        o_type: &'ll Value,
        fn_to_call: &'ll Value,
        fn_ty: &'ll Type,
        num_args: u64,
        s_ident_t: &'ll Value,
    ) {
        let nullptr = cx.const_null(cx.type_ptr());
        let i64_max = cx.get_const_i64(u64::MAX);
        let num_args = cx.get_const_i32(num_args);
        let args =
            vec![s_ident_t, i64_max, num_args, geps.0, geps.1, geps.2, o_type, nullptr, nullptr];
        builder.call(fn_ty, fn_to_call, &args, None);
    }

    // Step 2)
    let s_ident_t = generate_at_one(&cx);
    let o = o_types[0];
    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
    generate_mapper_call(&mut builder, &cx, geps, o, begin_mapper_decl, fn_ty, num_args, s_ident_t);

    // Step 3)
    // Here we will add code for the actual kernel launches in a follow-up PR.
    // FIXME(offload): launch kernels

    // Step 4)
    unsafe { llvm::LLVMRustPositionAfter(builder.llbuilder, kernel_call) };

    let geps = get_geps(&mut builder, &cx, ty, ty2, a1, a2, a4);
    generate_mapper_call(&mut builder, &cx, geps, o, end_mapper_decl, fn_ty, num_args, s_ident_t);

    builder.call(mapper_fn_ty, unregister_lib_decl, &[tgt_bin_desc_alloca], None);

    // With this we generated the following begin and end mappers. We could generate the update
    // mapper analogously in a follow-up.
    // call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 3, ptr %27, ptr %28, ptr %29, ptr @.offload_maptypes, ptr null, ptr null)
    // call void @__tgt_target_data_update_mapper(ptr @1, i64 -1, i32 2, ptr %46, ptr %47, ptr %48, ptr @.offload_maptypes.1, ptr null, ptr null)
    // call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 3, ptr %49, ptr %50, ptr %51, ptr @.offload_maptypes, ptr null, ptr null)
}
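Taken together, and as a hedged sketch only (value names and exact operands are illustrative, derived from the code above rather than from test output), `main` should end up shaped roughly like:

    call void @__tgt_register_lib(ptr %EmptyDesc)
    call void @__tgt_init_all_rtls()
    ; ... GEPs and stores filling .offload_baseptrs, .offload_ptrs, .offload_sizes ...
    call void @__tgt_target_data_begin_mapper(ptr @1, i64 -1, i32 1, ptr %gep1, ptr %gep2, ptr %gep3, ptr @.offload_maptypes.1, ptr null, ptr null)
    call void @kernel_1(ptr %x)   ; unchanged; still runs on the host for now
    call void @__tgt_target_data_end_mapper(ptr @1, i64 -1, i32 1, ptr %gep1, ptr %gep2, ptr %gep3, ptr @.offload_maptypes.1, ptr null, ptr null)
    call void @__tgt_unregister_lib(ptr %EmptyDesc)

(The `i32 1` assumes a kernel with a single argument.)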
@@ -118,6 +118,10 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             r
         }
     }
+
+    pub(crate) fn const_null(&self, t: &'ll Type) -> &'ll Value {
+        unsafe { llvm::LLVMConstNull(t) }
+    }
 }

 impl<'ll, 'tcx> ConstCodegenMethods for CodegenCx<'ll, 'tcx> {
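A short usage note: the new `const_null` helper is what `gpu_offload.rs` above relies on for null pointer arguments, for example:

    let nullptr = cx.const_null(cx.type_ptr());
    let aux_addr = cx.const_null(cx.type_ptr());

Both lines appear verbatim in the new file.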

@@ -377,6 +381,11 @@ pub(crate) fn bytes_in_context<'ll>(llcx: &'ll llvm::Context, bytes: &[u8]) -> &
     }
 }
+
+pub(crate) fn named_struct<'ll>(ty: &'ll Type, elts: &[&'ll Value]) -> &'ll Value {
+    let len = c_uint::try_from(elts.len()).expect("LLVMConstStructInContext elements len overflow");
+    unsafe { llvm::LLVMConstNamedStruct(ty, elts.as_ptr(), len) }
+}
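`named_struct` pairs with a type created via `cx.type_named_struct(...)`; the new file uses it to build the `__tgt_offload_entry` initializer:

    let initializer = crate::common::named_struct(offload_entry_ty, &elems);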

 fn struct_in_context<'ll>(
     llcx: &'ll llvm::Context,
     elts: &[&'ll Value],
@@ -412,6 +412,20 @@ impl ModuleLlvm {
         }
     }
+
+    fn tm_from_cgcx(
+        cgcx: &CodegenContext<LlvmCodegenBackend>,
+        name: &str,
+        dcx: DiagCtxtHandle<'_>,
+    ) -> Result<OwnedTargetMachine, FatalError> {
+        let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name);
+        match (cgcx.tm_factory)(tm_factory_config) {
+            Ok(m) => Ok(m),
+            Err(e) => {
+                return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
+            }
+        }
+    }

     fn parse(
         cgcx: &CodegenContext<LlvmCodegenBackend>,
         name: &CStr,
@@ -421,13 +435,7 @@ impl ModuleLlvm {
         unsafe {
             let llcx = llvm::LLVMRustContextCreate(cgcx.fewer_names);
             let llmod_raw = back::lto::parse_module(llcx, name, buffer, dcx)?;
-            let tm_factory_config = TargetMachineFactoryConfig::new(cgcx, name.to_str().unwrap());
-            let tm = match (cgcx.tm_factory)(tm_factory_config) {
-                Ok(m) => m,
-                Err(e) => {
-                    return Err(dcx.emit_almost_fatal(ParseTargetMachineConfig(e)));
-                }
-            };
+            let tm = ModuleLlvm::tm_from_cgcx(cgcx, name.to_str().unwrap(), dcx)?;

             Ok(ModuleLlvm { llmod_raw, llcx, tm: ManuallyDrop::new(tm) })
         }
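This last hunk simply deduplicates target machine construction: the inline `match` in `parse` is replaced by a call to the new helper, as shown above:

    let tm = ModuleLlvm::tm_from_cgcx(cgcx, name.to_str().unwrap(), dcx)?;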