File Util.h

File List > Intern > rayx-core > src > Tracer > Util.h
#pragma once

#include <alpaka/alpaka.hpp>
#include <optional>
#include <vector>

#include "Debug/Instrumentor.h"
#include "Shader/Rand.h"
#include "Shader/RaysPtr.h"

namespace rayx {

// Optional one-dimensional alpaka buffer holding elements of type T, indexed with int32_t.
// An empty optional means that no buffer has been allocated yet.
template <typename Acc, typename T>
using OptBuf = std::optional<alpaka::Buf<Acc, T, alpaka::DimInt<1>, int32_t>>;

// This struct is analogous to struct Rays, but holds one OptBuf per ray attribute instead of a vector,
// so the attributes can live in alpaka buffers on CPU or GPU.
// The members are generated by expanding RAYX_X_MACRO_RAY_ATTR: every (type, name, flag) entry
// becomes a member `OptBuf<Acc, type> name;` (the flag is not used here).
template <typename Acc>
struct RaysBuf {
#define X(type, name, flag) OptBuf<Acc, type> name;

    RAYX_X_MACRO_RAY_ATTR
#undef X
};

// Builds a RaysPtr view over the buffers stored in `buf`.
// For every ray attribute listed in RAYX_X_MACRO_RAY_ATTR the matching RaysPtr member receives the
// native pointer of the attribute's buffer, or nullptr when that buffer has not been allocated.
// The returned pointers do not own anything; they refer to the buffers held by `buf`.
template <typename Acc>
RaysPtr raysBufToRaysPtr(RaysBuf<Acc>& buf) {
#define X(elemType, attr, maskFlag) .attr = buf.attr.has_value() ? alpaka::getPtrNative(*buf.attr) : nullptr,
    RaysPtr ptrs{
        RAYX_X_MACRO_RAY_ATTR
    };
#undef X
    return ptrs;
}

// Integer division that rounds up: the dividend is biased by (divisor - 1) before the truncating division.
inline int ceilIntDivision(const int dividend, const int divisor) {
    const int biased = divisor + dividend - 1;
    return biased / divisor;
}

// Returns the smallest power of two that is greater than or equal to `value`.
// Values that are already a power of two are returned unchanged; non-positive values yield 0.
//
// The computation is done purely with integers. The earlier floating-point formulation
// (pow(2, ceil(log(v) / log(2)))) depends on rounding of the logarithm quotient and may overshoot
// for inputs that are exact powers of two, which would double the result.
//
// Precondition: value <= 2^30, because the next larger power of two is not representable as int.
inline int nextPowerOfTwo(const int value) {
    if (value <= 0) return 0;

    assert(value <= (1 << 30) && "nextPowerOfTwo: result is not representable as int");

    // unsigned arithmetic keeps the shift well defined up to 2^31
    const auto target = static_cast<unsigned int>(value);
    unsigned int power = 1u;
    while (power < target) power <<= 1;

    return static_cast<int>(power);
}

// Rounds `value` up to the next multiple of `divisor`.
// A value that already is a multiple is returned unchanged.
// A divisor of zero is reported through RAYX_EXIT.
inline int nextMultiple(const int value, const int divisor) {
    if (divisor == 0) RAYX_EXIT << "error: divisor must not be zero";

    const int overshoot = value % divisor;

    // no overshoot: already a multiple; otherwise add what is missing to reach the next bigger multiple
    return overshoot == 0 ? value : value + (divisor - overshoot);
}
// Makes sure `buf` holds a buffer with room for at least `size` elements.
//
// Nothing happens when `buf` already contains a buffer whose first extent is >= size.
// Otherwise a new buffer of nextPowerOfTwo(size) elements is requested through
// alpaka::allocAsyncBufIfSupported on queue `q` and assigned to `buf`. The previous buffer (if any)
// is replaced; its contents are not copied by this function.
//
// q:    queue handed to the allocation call
// buf:  optional buffer to (re)allocate in place
// size: required number of elements
template <typename Queue, typename Buf>
inline void allocBuf(Queue q, std::optional<Buf>& buf, const int size) {
    using Idx  = alpaka::Idx<Buf>;
    using Elem = alpaka::Elem<Buf>;

    const bool largeEnough = buf.has_value() && !(alpaka::getExtents(*buf)[0] < size);
    if (largeEnough) return;

    // The capacity is computed once so that the logged size always matches the actual allocation.
    // The byte count is formed in std::size_t: rounding the element count and multiplying afterwards
    // avoids narrowing `size * sizeof(Elem)` to int and reports the real size even when sizeof(Elem)
    // is not a power of two.
    const int capacity  = nextPowerOfTwo(size);
    const auto numBytes = static_cast<std::size_t>(capacity) * sizeof(Elem);

    RAYX_VERB << (!buf ? "new alloc on device: " : "realloc on device: ") << numBytes << " bytes";
    buf = alpaka::allocAsyncBufIfSupported<Elem, Idx>(q, capacity);
}

// Allocates (or grows) the buffers of `raysBuf` for all ray attributes selected by `attrMask`.
// RAYX_X_MACRO_RAY_ATTR is expanded into one check per attribute: when the attribute's flag is
// contained in the mask, allocBuf is called for the matching member with the requested `size`.
// Attributes that are not selected are left untouched.
template <typename Queue, typename Acc>
inline void allocRaysBuf(Queue q, const RayAttrMask attrMask, RaysBuf<Acc>& raysBuf, const int size) {
#define X(elemType, attr, maskFlag)                  \
    if (contains(attrMask, RayAttrMask::maskFlag)) { \
        allocBuf(q, raysBuf.attr, size);             \
    }
    RAYX_X_MACRO_RAY_ATTR
#undef X
}

// Constraints on the number of threads per block that can be passed to execWithValidWorkDiv.
// The descriptions below state how execWithValidWorkDiv treats each alternative.
namespace BlockSizeConstraint {

// No constraint: the work division obtained from alpaka is used unchanged.
struct None {};

// The block size is set to exactly `value` and the number of blocks is recomputed accordingly.
struct Exact {
    int value;
};

// The block size obtained from alpaka must be at least `value`; this is only checked with an assert.
struct AtLeast {
    int value;
};

// The block size is reduced to `value` if the one obtained from alpaka is larger.
struct AtMost {
    int value;
};

// Combination of AtLeast (asserted) and AtMost (block size reduced to `atMost` if necessary).
struct InRange {
    int atLeast;
    int atMost;
};

// Sum type over all supported constraints.
using Variant = std::variant<None, Exact, AtLeast, AtMost, InRange>;

}  // namespace BlockSizeConstraint

// TODO: maybe make a PR to alpaka for alpaka::Acc<Dev> to extract Acc from DevAcc (= Dev<Platform<Acc>>)
template <typename Acc, typename DevAcc, typename Queue, typename Kernel, typename... Args>
inline void execWithValidWorkDiv(DevAcc devAcc, Queue q, const int numElements, BlockSizeConstraint::Variant blockSizeConstraint,
                                 const Kernel& kernel, Args&&... args) {
    const auto conf = alpaka::KernelCfg<Acc>{
        .gridElemExtent                        = numElements,
        .threadElemExtent                      = 1,
        .blockThreadMustDivideGridThreadExtent = false,
    };

    auto workDiv = alpaka::getValidWorkDiv(conf, devAcc, kernel, std::forward<Args>(args)...);
    std::visit(
        [&]<typename BlockSizeConstraintType>(BlockSizeConstraintType constraint) {
            if constexpr (std::is_same_v<BlockSizeConstraintType, BlockSizeConstraint::Exact>) {
                assert(workDiv.m_blockThreadExtent[0] <= constraint.value && "BlockSizeConstraint::Exact exceeds the capabilities this device");
                workDiv.m_blockThreadExtent = constraint.value;
                workDiv.m_gridBlockExtent   = ceilIntDivision(numElements, constraint.value);
            }

            if constexpr (std::is_same_v<BlockSizeConstraintType, BlockSizeConstraint::AtMost>) {
                if (constraint.value < workDiv.m_blockThreadExtent[0]) {
                    workDiv.m_blockThreadExtent = constraint.value;
                    workDiv.m_gridBlockExtent   = ceilIntDivision(numElements, constraint.value);
                }
            }

            if constexpr (std::is_same_v<BlockSizeConstraintType, BlockSizeConstraint::AtLeast>) {
                assert(constraint.value <= workDiv.m_blockThreadExtent[0] && "BlockSizeConstraint::AtLeast exceeds the capabilities this device");
            }

            if constexpr (std::is_same_v<BlockSizeConstraintType, BlockSizeConstraint::InRange>) {
                assert(constraint.atLeast <= workDiv.m_blockThreadExtent[0] && "BlockSizeConstraint::InRange exceeds capabilities of this device");
                if (constraint.atMost < workDiv.m_blockThreadExtent[0]) {
                    workDiv.m_blockThreadExtent = constraint.atMost;
                    workDiv.m_gridBlockExtent   = ceilIntDivision(numElements, constraint.atMost);
                }
            }
        },
        blockSizeConstraint);

    RAYX_VERB << "execute kernel with launch config: "
              << "blocks = " << workDiv.m_gridBlockExtent[0] << ", "
              << "threads = " << workDiv.m_blockThreadExtent[0];

    alpaka::exec<Acc>(q, workDiv, kernel, std::forward<Args>(args)...);
}

}  // namespace rayx