diff --git a/src/aes/aes_ctr_prng.cpp b/src/aes/aes_ctr_prng.cpp index e8cda6e..8e8a607 100644 --- a/src/aes/aes_ctr_prng.cpp +++ b/src/aes/aes_ctr_prng.cpp @@ -1,73 +1,128 @@ +/** + * @file + * @brief High-throughput AES-CTR PRNG for nwipe using Linux AF_ALG. + * + * @details + * This translation unit implements a cryptographically strong pseudorandom + * byte stream based on AES-CTR, leveraging the Linux kernel's crypto API + * (AF_ALG) for hardware-accelerated AES (AES-NI/VAES/NEON/SVE where available). + * + * Motivation: + * - nwipe must supply multi-GB/s of random data to saturate modern NVMe/RAID. + * - User-space OpenSSL-based paths in older builds plateaued around ~250 MB/s + * on some systems due to syscall/memory-copy patterns not tuned for the + * workload. + * - The kernel provides highly optimized AES implementations and scheduling. + * + * Key properties: + * - A single AF_ALG operation socket is opened *once per thread* and reused + * for all generation calls (low syscall overhead). + * - Each generation produces a fixed-size chunk (see CHUNK) by issuing exactly + * two syscalls: `sendmsg()` (to supply IV and length) and `read()` (to fetch + * the keystream). + * - Counter management (increment) is done in user space for determinism. + * + * @warning + * IV (Counter) Encoding: + * This implementation builds the 128-bit AES-CTR IV by storing two 64-bit + * limbs in **little-endian** order (low limb at IV[0..7], high limb at + * IV[8..15]) and then incrementing the 128-bit value in little-endian form. + * This deviates from the big-endian counter semantics commonly assumed by + * RFC-style AES-CTR specifications. The stream remains secure (uniqueness + * of IVs is preserved) but is **not interoperable** with generic RFC-CTR + * streams. See `aes_ctr_prng.h` for a prominent header-level note. + * + * Threading: + * - `tls_op_fd` is thread-local; each thread owns its kernel op-socket. + * - The kernel cipher is re-entrant. 
`global_key` is the one object in this TU + * shared across threads; every aes_ctr_prng_init() call overwrites it (unsynchronized). + * + * Error handling: + * - Functions return `0` on success and `-1` on failure. When underlying + * syscalls fail, `-1` is returned; callers may consult `errno` as usual. + */ + // ============================================================================================ -// aes_ctr_prng.cpp — High‑Throughput AES‑256‑CTR PRNG for nwipe -// -------------------------------------------------------------------------------------------- // WHY THIS FILE EXISTS -// -------------------- -// nwipe, a secure disk‑wiping tool, needs cryptographically strong random data at multi‑GB/s +// -------------------------------------------------------------------------------------------- +// nwipe, a secure disk-wiping tool, needs cryptographically strong random data at multi-GB/s // in order to keep up with today’s NVMe and RAID arrays. Users complained when the classic -// user‑space OpenSSL path plateaued around ~250 MB/s on modern CPUs. The Linux kernel -// already ships an extremely fast AES implementation (with transparent AES‑NI / VAES / NEON +// user-space OpenSSL path plateaued around ~250 MB/s on modern CPUs. The Linux kernel +// already ships an extremely fast AES implementation (with transparent AES-NI / VAES / NEON // acceleration) that can be accessed from user space via the AF_ALG socket family. By // delegating the heavy crypto to the kernel we gain all of the following *for free*: -// • Perfectly tuned instruction selection per CPU (AES‑NI, VAES, SVE, etc.) -// • Full cache‑line prefetch scheduling written by kernel crypto maintainers -// • Zero‑copy when the cipher runs in the same core -// • Automatic fall‑back to software if the CPU lacks AES‑NI +// • Perfectly tuned instruction selection per CPU (AES-NI, VAES, SVE, etc.) 
+// • Full cache-line prefetch scheduling written by kernel crypto maintainers +// • Zero-copy when the cipher runs in the same core +// • Automatic fall-back to software if the CPU lacks AES-NI // // DESIGN OVERVIEW (TL;DR) // ---------------------- // ┌─ userspace ───────────────────────────────────────────────────────────────────────────────┐ // │ +-------------------------------+ │ -// │ nwipe | aes_ctr_state_t (256 bit) | (1) initialise, store key+counter │ +// │ nwipe | aes_ctr_state_t (256 bit) | (1) initialise, store key+counter │ // │ +-------------------------------+ │ // │ │ ▲ │ -// │ │ (2) sendmsg() + read() per 16 KiB chunk │ │ +// │ │ (2) sendmsg() + read() per fixed-size chunk │ │ // └─────────────────────┼───────────────────────────────────────────────────────────┤ kernel │ // │ │ space │ // persistent FD ▼ │ │ -// ┌──────────────────────┐ │ │ -// │ AF_ALG op socket │ (ctr(aes)) │ │ -// └──────────────────────┘ └─────────┘ +// ┌──────────────────────┐ │ │ +// │ AF_ALG op socket │ (ctr(aes)) │ │ +// └──────────────────────┘ └─────────┘ // -// Key idea: **The socket is opened once** (in aes_ctr_prng_init) and kept open for the entire -// lifetime of the process. Each PRNG call only needs two inexpensive syscalls: -// • sendmsg() — tells the kernel the IV (i.e. current counter) + plaintext length -// • read() — returns the ciphertext (= keystream) into our output buffer -// That is less overhead than memcpy() at these block sizes. -// -// PUBLIC STATE (aes_ctr_state_t) REMAINS 256 bit -// --------------------------------------------- -// We consciously do *NOT* fold the file descriptor into the public state because that would -// destroy ABI compatibility with libnwipe. Instead, g_op_fd below is TU‑local (file‑static). -// Multiple independent PRNG instances *share* this socket — fine for nwipe’s single thread. -// -// SAFETY / THREADING -// ------------------ -// • The kernel cipher itself is re-entrant; thread-local FD guarantees call-site safety. 
-// • Counter increment (`ctr_add`) is done entirely in user space; no atomic ops needed because -// each thread owns its own `aes_ctr_state_t` instance. +// Public ABI stability: +// --------------------- +// The fd is *not* part of the public state (preserves libnwipe ABI). A TU-local, +// thread-local descriptor is used internally; multiple PRNG instances per thread +// share the same op-socket as needed. // +// Safety / threading: +// ------------------- +// • The kernel cipher is re-entrant. Thread-local fd avoids cross-thread hazards. +// • Counter increments occur in user space; one aes_ctr_state_t per thread. // ============================================================================================== #include "aes_ctr_prng.h" // public header (256-bit state, extern "C" API) #include // socket(), bind(), accept(), sendmsg() -#include // AF_ALG constants +#include // AF_ALG constants and skcipher API #include // read(), close() #include // memcpy(), memset(), strcpy() #include // std::array for control buffer +// ---------------------------------------------------------------------------------------------- +// CONFIGURABLE CHUNK SIZE +// ---------------------------------------------------------------------------------------------- +// The per-call output size (CHUNK) can be configured at compile time via +// AES_CTR_PRNG_CHUNK_BYTES. Default is 128 KiB. +// Example: +// gcc -DAES_CTR_PRNG_CHUNK_BYTES="(64u*1024u)" ... +// ---------------------------------------------------------------------------------------------- +#ifndef AES_CTR_PRNG_CHUNK_BYTES +#define AES_CTR_PRNG_CHUNK_BYTES (128u * 1024u) // 128 KiB default +#endif + // ---------------------------------------------------------------------------------------------- // GLOBAL 256-BIT KEY -// ---------------------------------------------------------------------------------------------- -// • Loaded from the user-supplied seed in aes_ctr_prng_init(). -// • Constant for the lifetime of the process. 
-// • Exposed (non-static) so unit tests in another TU can verify it. +// ---------------------------------------------------------------------------------------------- +// • Loaded from user-supplied seed in aes_ctr_prng_init(). +// • Intended to remain constant for the process lifetime (or until re-init). +// • Exposed (non-static) so out-of-TU tests can assert correct key handling. +// +// @note Consider zeroizing on shutdown to avoid key retention in core dumps. +// ---------------------------------------------------------------------------------------------- unsigned char global_key[32]; // ---------------------------------------------------------------------------------------------- // THREAD-LOCAL OPERATION SOCKET (one per nwipe thread) -// ---------------------------------------------------------------------------------------------- -// Portable TLS qualifier: C++11 `thread_local` or GCC/Clang `__thread` for C compilation. +// ---------------------------------------------------------------------------------------------- +// Portable TLS qualifier: C++11 `thread_local` or GCC/Clang `__thread` for C builds. +// +// @invariant tls_op_fd == -1 ⇒ this thread has not opened the op-socket yet. +// tls_op_fd >= 0 ⇒ valid AF_ALG operation socket for "ctr(aes)". +// +// @thread_safety Thread-local; no synchronization required. 
+// ---------------------------------------------------------------------------------------------- #if defined(__cplusplus) && __cplusplus >= 201103L #define PRNG_THREAD_LOCAL thread_local #else @@ -77,32 +132,71 @@ unsigned char global_key[32]; PRNG_THREAD_LOCAL static int tls_op_fd = -1; // -1 ⇒ not yet opened in this thread // ---------------------------------------------------------------------------------------------- -// CONSTANTS +// CONSTANTS / INTERNAL HELPERS // ---------------------------------------------------------------------------------------------- namespace { -constexpr std::size_t CHUNK = 1u << 14; // 16 KiB produced per kernel call -constexpr std::size_t AES_BLOCK = 16u; // fixed by AES spec -constexpr std::size_t BLOCKS_PER_CHUNK = CHUNK / AES_BLOCK; // 1024 CTR blocks +/** + * @brief AES block size in bytes (by specification). + */ +constexpr std::size_t AES_BLOCK = 16u; -// Little-endian 64-bit store helper. +/** + * @brief Fixed-size generation granularity per kernel call. + * @details + * Adjust at build time via AES_CTR_PRNG_CHUNK_BYTES to balance syscall + * overhead vs. latency and memory traffic. + * Typical values: 16 KiB (legacy default), 64 KiB, 128 KiB. + */ +constexpr std::size_t CHUNK = AES_CTR_PRNG_CHUNK_BYTES; + +static_assert(CHUNK % AES_BLOCK == 0, + "AES_CTR_PRNG_CHUNK_BYTES must be a multiple of AES_BLOCK (16 bytes)"); + +/// Number of AES-CTR blocks produced per CHUNK. +constexpr std::size_t BLOCKS_PER_CHUNK = CHUNK / AES_BLOCK; + +/** + * @brief Store a 64-bit integer in little-endian byte order. + * + * @param v 64-bit value. + * @param buf Destination pointer; must point to at least 8 writable bytes. + * + * @note + * This function enforces a little-endian layout regardless of host endianness. + * For hot paths you may consider an optimized version using memcpy/bswap on + * big-endian hosts instead of byte-wise stores. 
+ */ static inline void store64_le(uint64_t v, unsigned char *buf) { for (int i = 0; i < 8; ++i) buf[i] = static_cast(v >> (8 * i)); } -// ============================================================================================== -// ControlBuilder — assembles the msghdr + control messages for AF_ALG -// ============================================================================================== -// • Control message #1 ALG_SET_OP = ALG_OP_ENCRYPT -// • Control message #2 ALG_SET_IV = 128-bit IV (our counter) -// • Data iovec points to `plain` (all-zero buffer, length CHUNK) -// -// Everything lives on the stack, so constructing ControlBuilder is basically free. -// +/** + * @class ControlBuilder + * @brief Helper to assemble `msghdr` and control messages for AF_ALG. + * + * @details + * Builds the control payload for one `sendmsg()` call against an AF_ALG + * skcipher operation socket: + * - Control message #1: `ALG_SET_OP = ALG_OP_ENCRYPT` + * - Control message #2: `ALG_SET_IV` with a 128-bit IV + * - Data iovec: points at the plaintext buffer (here: zero-bytes of length CHUNK) + * + * All data structures live on the stack; constructing this helper is O(1). + * + * @note + * The kernel expects `ivlen` as a host-endian u32 followed by `iv` bytes. + * "Network order not required" is intentional for AF_ALG skcipher IVs. + */ class ControlBuilder { public: + /** + * @param iv 128-bit IV (counter value), passed as 16 bytes. + * @param plain Pointer to plaintext buffer (here: all-zero array). + * @param len Plaintext length in bytes; determines keystream length. 
+ */ ControlBuilder(const unsigned char iv[16], void *plain, size_t len) { // ---------- Data iovec ---------- @@ -110,7 +204,7 @@ public: iov_.iov_len = len; // ---------- msghdr -------------- - msg_.msg_name = nullptr; // already bound + msg_.msg_name = nullptr; // already bound via bind() msg_.msg_namelen = 0; msg_.msg_iov = &iov_; msg_.msg_iovlen = 1; @@ -130,15 +224,16 @@ public: c2->cmsg_level = SOL_ALG; c2->cmsg_type = ALG_SET_IV; c2->cmsg_len = CMSG_LEN(sizeof(uint32_t) + 16); - uint32_t ivlen = 16; // network order not required + uint32_t ivlen = 16; // host endian; not network order std::memcpy(CMSG_DATA(c2), &ivlen, sizeof(ivlen)); std::memcpy(CMSG_DATA(c2) + sizeof(ivlen), iv, 16); } + /// @return Prepared msghdr suitable for `sendmsg()`. struct msghdr *msg() { return &msg_; } private: - // Enough space for both control messages. + // Control buffer sufficient for both control messages. std::array control_{}; @@ -146,9 +241,21 @@ private: struct iovec iov_{}; }; -// ---------------------------------------------------------------------------------------------- -// open_ctr_socket() — perform socket → bind → setsockopt → accept sequence -// ---------------------------------------------------------------------------------------------- +/** + * @brief Open a "ctr(aes)" skcipher operation socket via AF_ALG. + * + * @details + * Performs the `socket()` → `bind()` → `setsockopt(ALG_SET_KEY)` → `accept()` + * sequence. The returned fd is the operation socket used for `sendmsg()`+`read()`. + * + * @param key AES key (32 bytes for AES-256). + * @return Operation socket fd (>= 0) on success, or -1 on failure. + * + * @warning + * This function does not set `FD_CLOEXEC` nor handle `SIGPIPE`. Consider using + * `SOCK_CLOEXEC` on `socket()` and `accept4()` and `MSG_NOSIGNAL` on `sendmsg()` + * in hardened builds. + */ static int open_ctr_socket(const unsigned char key[32]) { // 1. Create transform socket (AF_ALG family). 
@@ -176,7 +283,20 @@ static int open_ctr_socket(const unsigned char key[32]) return op; // may be -1 on error } -// Increment 128-bit counter by n blocks (little-endian addition). +/** + * @brief Increment a 128-bit little-endian counter by @p n AES blocks. + * + * @details + * The counter is represented as two 64-bit little-endian limbs in state->s[0..1]. + * The increment is performed modulo 2^128 with carry propagation from low to high. + * + * @param st PRNG state with s[0]=lo, s[1]=hi limbs. + * @param n Number of AES blocks to add. + * + * @note + * This is **little-endian** counter arithmetic; see the big file-level warning + * about non-RFC CTR semantics. + */ static void ctr_add(aes_ctr_state_t *st, uint64_t n) { uint64_t old = st->s[0]; @@ -191,11 +311,27 @@ static void ctr_add(aes_ctr_state_t *st, uint64_t n) // ================================================================================================= extern "C" { -// ----------------------------------------------------------------------------------------------- -// aes_ctr_prng_init() -// • Clears state, copies first 128 bits of seed into counter, saves 256-bit key globally. -// • Lazily opens thread-local AF_ALG socket. -// ----------------------------------------------------------------------------------------------- +/** + * @brief Initialize PRNG state and lazily open the per-thread AF_ALG socket. + * + * @param[out] state Pointer to PRNG state (must be non-null). + * @param[in] init_key Seed as an array of unsigned long; must provide >= 32 bytes. + * @param[in] key_length Number of `unsigned long` words in @p init_key. + * + * @retval 0 Success. + * @retval -1 Invalid parameters or AF_ALG setup failure. + * + * @details + * - Zeroes the entire state and copies the first 128 bits of the seed into the + * 128-bit counter (little-endian limb order). + * - Saves the first 256 bits as the AES-256 key in @c global_key. 
+ * - Opens the AF_ALG operation socket for "ctr(aes)" on first call in this + * thread and stores the fd in thread-local storage. + * + * @warning + * The chosen IV scheme is little-endian and not RFC-interoperable. + * Do not mix with external AES-CTR generators expecting big-endian counters. + */ int aes_ctr_prng_init(aes_ctr_state_t *state, unsigned long init_key[], unsigned long key_length) @@ -203,11 +339,11 @@ int aes_ctr_prng_init(aes_ctr_state_t *state, if (!state || !init_key || key_length * sizeof(unsigned long) < 32) return -1; - // Zero entire state, then put seed[0..15] into counter. + // Zero entire state, then put seed[0..15] into counter (LE limbs). std::memset(state, 0, sizeof(*state)); std::memcpy(state->s, init_key, sizeof(uint64_t) * 2); - // Remember full key for possible re-opens. + // Remember full AES-256 key (32 bytes) for possible re-opens. std::memcpy(global_key, init_key, 32); // Open per-thread socket on first call in this thread. @@ -218,18 +354,37 @@ int aes_ctr_prng_init(aes_ctr_state_t *state, return 0; } -// ----------------------------------------------------------------------------------------------- -// aes_ctr_prng_genrand_16k_to_buf() -// • Hot path: produces exactly 16 KiB of keystream in `bufpos`. -// • Only two syscalls thanks to persistent thread-local socket. -// ----------------------------------------------------------------------------------------------- -int aes_ctr_prng_genrand_16k_to_buf(aes_ctr_state_t *state, +/** + * @brief Produce exactly CHUNK bytes of keystream into @p bufpos. + * + * @param[in] state PRNG state (counter source). + * @param[out] bufpos Destination buffer; must hold at least CHUNK bytes. + * + * @retval 0 Success (CHUNK bytes written). + * @retval -1 Parameter error or syscall failure. + * + * @details + * Sequence per call: + * 1. Assemble a 128-bit IV by storing s[0] (low) and s[1] (high) as + * little-endian 64-bit words into a 16-byte buffer. + * 2. 
Build the AF_ALG control message (ALG_SET_OP=ENCRYPT, ALG_SET_IV=IV) + * and data iovec pointing to a static all-zero plaintext of length CHUNK. + * 3. `sendmsg()` the request and `read()` back exactly CHUNK bytes of + * ciphertext — which, because plaintext is zero, equals the keystream. + * 4. Increment the 128-bit counter by `BLOCKS_PER_CHUNK`. + * + * @note + * The zero-plaintext buffer is static and zero-initialized once; the kernel + * will not read uninitialized memory. Using zero plaintext is standard for + * obtaining the raw AES-CTR keystream. + */ +int aes_ctr_prng_genrand_128k_to_buf(aes_ctr_state_t *state, unsigned char *bufpos) { if (!state || !bufpos || tls_op_fd < 0) return -1; - // --- Construct 128-bit IV from counter ------------------------------------ + // --- Construct 128-bit IV from counter (little-endian limbs) ------------- unsigned char iv[16]; store64_le(state->s[0], iv); // little-endian low limb store64_le(state->s[1], iv + 8); // little-endian high limb @@ -247,10 +402,16 @@ int aes_ctr_prng_genrand_16k_to_buf(aes_ctr_state_t *state, return 0; } -// ----------------------------------------------------------------------------------------------- -// aes_ctr_prng_shutdown() -// • Optional cleanup helper (kernel will close FDs at process exit anyway). -// ----------------------------------------------------------------------------------------------- +/** + * @brief Optional cleanup helper (explicitly closes the per-thread op-socket). + * + * @retval 0 Always succeeds. + * + * @details + * The kernel will close FDs at process exit, but explicit shutdown helps + * test harnesses and avoids keeping descriptors alive across exec(). + * Consider zeroizing @c global_key here for defense-in-depth. 
+ */ int aes_ctr_prng_shutdown(void) { if (tls_op_fd >= 0) { @@ -260,5 +421,5 @@ int aes_ctr_prng_shutdown(void) return 0; } -} // extern \"C\" +} // extern "C" diff --git a/src/aes/aes_ctr_prng.h b/src/aes/aes_ctr_prng.h index edbf195..940a225 100644 --- a/src/aes/aes_ctr_prng.h +++ b/src/aes/aes_ctr_prng.h @@ -38,12 +38,12 @@ int aes_ctr_prng_init(aes_ctr_state_t *state, unsigned long init_key[], unsigned long key_length); -/* Generate one 16 KiB chunk of random data into bufpos. +/* Generate one 128 KiB chunk of random data into bufpos. * * Returns 0 on success, -1 on failure. * Uses the persistent AF_ALG socket. */ -int aes_ctr_prng_genrand_16k_to_buf(aes_ctr_state_t *state, +int aes_ctr_prng_genrand_128k_to_buf(aes_ctr_state_t *state, unsigned char *bufpos); /* Optional: Close the persistent AF_ALG socket at program shutdown. diff --git a/src/prng.c b/src/prng.c index 71832f6..0065824 100644 --- a/src/prng.c +++ b/src/prng.c @@ -346,72 +346,232 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ) } /** - * Initialize the AES-CTR PRNG state. + * @brief Initialize the AES-CTR PRNG state for this thread. * - * Signature: int nwipe_aes_ctr_prng_init(NWIPE_PRNG_INIT_SIGNATURE); + * @details + * Initializes the thread-local PRNG based on the supplied seed and resets the + * ring-buffer prefetch cache. The underlying AES-CTR implementation uses a + * persistent AF_ALG operation socket per thread, opened lazily by + * aes_ctr_prng_init(). The public state only stores a 128-bit counter while + * the kernel keeps the expanded AES key schedule. * - * - Allocates state if *state is NULL. - * - Calls underlying aes_ctr_prng_init() with provided seed. - * - Logs errors on failure. - */ -/* - * high‑throughput wrapper with pre‑fetch buffer - * -------------------------------------------------------------------------- - * Provides NWIPE_PRNG_INIT / NWIPE_PRNG_READ glue around the persistent - * kernel‑AES PRNG. 
Adds a 64 KiB stash buffer so that typical small requests - * from nwipe (e.g. 32 B, 512 B) do **not** trigger a syscall each time. + * @param[in,out] state Pointer to an opaque PRNG state handle. If `*state` is + * `NULL`, this function allocates it with `calloc()`. + * @param[in] seed Seed material (must contain at least 32 bytes). + * @param[in] ... Remaining parameters as defined by NWIPE_PRNG_INIT_SIGNATURE. + * + * @note + * The ring is intentionally left empty to keep init fast. Code in this file may + * "prefill" by invoking refill_stash_thread_local(*state, SIZE_OF_AES_CTR_PRNG) + * once (the helper is file-static, so other TUs cannot call it) to amortize first-use latency. + * + * @retval 0 Success. + * @retval -1 Allocation or initialization failure (already logged). */ -/* Thread‑local specifier that works in C11 and GNU C */ +/* + * High-throughput wrapper with a thread-local ring-buffer prefetch + * ---------------------------------------------------------------- + * This glue layer implements NWIPE_PRNG_INIT / NWIPE_PRNG_READ around the + * persistent kernel-AES PRNG. It maintains a lock-free, thread-local ring + * buffer ("stash") that caches keystream blocks produced in fixed-size chunks + * (SIZE_OF_AES_CTR_PRNG; 128 KiB as defined in prng.h). + * + * Rationale: + * - Nwipe frequently requests small slices (e.g., 32 B, 512 B, 4 KiB). Issuing + * one kernel call per small read would be syscall- and copy-bound. + * - By fetching larger chunks and serving small reads from the ring buffer, + * we reduce syscall rate and memory traffic and approach memcpy-limited + * throughput on modern CPUs with AES acceleration. + * + * Why a ring buffer (over a linear stash + memmove): + * - No O(n) memmove() when the buffer fills with a tail of unread bytes. + * - Constant-time head/tail updates via modulo arithmetic. + * - Better cache locality and fewer TLB/cache misses; improved prefetching. + */ + +/** @def NW_THREAD_LOCAL + * @brief Portable thread-local specifier for C11 and GNU C. 
+ * + * The ring buffer and its indices are thread-local, so no synchronization + * (locks/atomics) is required. Do not share this state across threads. + */ #if defined( __STDC_VERSION__ ) && __STDC_VERSION__ >= 201112L #define NW_THREAD_LOCAL _Thread_local #else #define NW_THREAD_LOCAL __thread #endif -/* ------------------------------------------------------------------------- - * Thread‑local stash implementation - * ------------------------------------------------------------------------- */ -NW_THREAD_LOCAL static unsigned char stash[STASH_CAPACITY]; -NW_THREAD_LOCAL static size_t stash_pos = 0; /* next unread byte */ -NW_THREAD_LOCAL static size_t stash_valid = 0; /* bytes currently in stash */ +/** @def NW_ALIGN + * @brief Minimal alignment helper for hot buffers/structures. + * + * 64-byte alignment targets typical cacheline boundaries to reduce false + * sharing and improve hardware prefetch effectiveness for linear scans. + */ +#if defined( __GNUC__ ) || defined( __clang__ ) +#define NW_ALIGN( N ) __attribute__( ( aligned( N ) ) ) +#else +#define NW_ALIGN( N ) _Alignas( N ) +#endif -/* Ensure at least `need` bytes are available in the stash. - * Returns 0 on success, -1 on PRNG failure. */ +/** + * @def STASH_CAPACITY + * @brief Ring capacity in bytes (power-of-two; multiple of CHUNK). + * + * @details + * Defaults to 1 MiB. Must be: + * - a power of two (allows modulo via bitmask), + * - a multiple of SIZE_OF_AES_CTR_PRNG, so each produced chunk fits whole. + * + * @note + * Practical choices: 512 KiB … 4 MiB depending on CHUNK size and workload. + * For SIZE_OF_AES_CTR_PRNG = 256 KiB, 1 MiB yields four in-flight chunks and + * works well for nwipe’s small-read patterns. 
+ */ +#ifndef STASH_CAPACITY +#define STASH_CAPACITY ( 1u << 20 ) /* 1 MiB */ +#endif + +#if defined( __STDC_VERSION__ ) && __STDC_VERSION__ >= 201112L +_Static_assert( ( STASH_CAPACITY & ( STASH_CAPACITY - 1 ) ) == 0, "STASH_CAPACITY must be a power of two" ); +_Static_assert( ( STASH_CAPACITY % SIZE_OF_AES_CTR_PRNG ) == 0, + "STASH_CAPACITY must be a multiple of SIZE_OF_AES_CTR_PRNG" ); +#endif + +/** @brief Thread-local ring buffer storage for prefetched keystream. */ +NW_THREAD_LOCAL static unsigned char stash[STASH_CAPACITY] NW_ALIGN( 64 ); + +/** + * @name Ring indices (thread-local) + * @{ + * @var rb_head Next read position (consumer cursor). + * @var rb_tail Next write position (producer cursor). + * @var rb_count Number of valid bytes currently stored. + * + * @invariant + * - 0 <= rb_count <= STASH_CAPACITY + * - rb_head, rb_tail in [0, STASH_CAPACITY) + * - (rb_tail - rb_head) mod STASH_CAPACITY == rb_count + * + * @warning + * These variables are TLS and must not be accessed from or shared with other + * threads. One PRNG instance per thread. + * @} + */ +NW_THREAD_LOCAL static size_t rb_head = 0; /* next byte to read */ +NW_THREAD_LOCAL static size_t rb_tail = 0; /* next byte to write */ +NW_THREAD_LOCAL static size_t rb_count = 0; /* occupied bytes */ + +/** + * @brief Free space available in the ring (bytes). + * @return Number of free bytes (0 … STASH_CAPACITY). + */ +static inline size_t rb_free( void ) +{ + return STASH_CAPACITY - rb_count; +} + +/** + * @brief Contiguous readable bytes starting at @c rb_head (no wrap). + * @return Number of contiguous bytes available to read without split memcpy. + */ +static inline size_t rb_contig_used( void ) +{ + size_t to_end = STASH_CAPACITY - rb_head; + return ( rb_count < to_end ) ? rb_count : to_end; +} + +/** + * @brief Contiguous writable bytes starting at @c rb_tail (no wrap). + * @return Number of contiguous bytes available to write without wrap. 
+ */ +static inline size_t rb_contig_free( void ) +{ + size_t to_end = STASH_CAPACITY - rb_tail; + size_t free = rb_free(); + return ( free < to_end ) ? free : to_end; +} + +/** + * @brief Ensure at least @p need bytes are buffered in the ring. + * + * @details + * Production model: + * - The kernel PRNG produces keystream in fixed-size chunks + * (SIZE_OF_AES_CTR_PRNG bytes; e.g., 16 KiB or 256 KiB). + * - We only ever append *whole* chunks. If total free space is less than one + * chunk, no production occurs (non-blocking style); the caller should first + * consume data and try again. + * + * Wrap handling: + * - Fast path: if a contiguous free region of at least one chunk exists at + * @c rb_tail, generate directly into @c stash + rb_tail (zero extra copies). + * - Wrap path: otherwise, generate one chunk into a small temporary buffer and + * split-copy into [rb_tail..end) and [0..rest). This case is infrequent and + * still cheaper than memmoving ring contents. + * + * @param[in] state Pointer to the AES-CTR state (per-thread). + * @param[in] need Minimum number of bytes the caller would like to have ready. + * + * @retval 0 Success (or no space to produce yet). + * @retval -1 PRNG failure (aes_ctr_prng_genrand_128k_to_buf() error). + * + * @warning + * Thread-local only. Do not call concurrently from multiple threads that share + * the same TLS variables. + */ static int refill_stash_thread_local( void* state, size_t need ) { - while( stash_valid - stash_pos < need ) + while( rb_count < need ) { - /* If buffer empty, reset indices to front. */ - if( stash_pos == stash_valid ) - { - stash_pos = stash_valid = 0; - } + /* Not enough total free space for a full CHUNK → let the caller read first. */ + if( rb_free() < SIZE_OF_AES_CTR_PRNG ) + break; - /* Ensure there is space for next 16 KiB chunk. 
*/ - if( stash_valid + SIZE_OF_AES_CTR_PRNG > STASH_CAPACITY ) + size_t cf = rb_contig_free(); + if( cf >= SIZE_OF_AES_CTR_PRNG ) { - /* Slide remaining unread bytes to front. */ - size_t remaining = stash_valid - stash_pos; - memmove( stash, stash + stash_pos, remaining ); - stash_pos = 0; - stash_valid = remaining; + /* Fast path: generate straight into the ring. */ + if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) state, stash + rb_tail ) != 0 ) + return -1; + rb_tail = ( rb_tail + SIZE_OF_AES_CTR_PRNG ) & ( STASH_CAPACITY - 1 ); + rb_count += SIZE_OF_AES_CTR_PRNG; } - - /* Generate another 16 KiB of keystream. */ - if( aes_ctr_prng_genrand_16k_to_buf( (aes_ctr_state_t*) state, stash + stash_valid ) != 0 ) + else { - return -1; + /* Wrap path: temporary production, then split-copy. */ + unsigned char tmp[SIZE_OF_AES_CTR_PRNG]; + if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) state, tmp ) != 0 ) + return -1; + size_t first = STASH_CAPACITY - rb_tail; /* bytes to physical end */ + memcpy( stash + rb_tail, tmp, first ); + memcpy( stash, tmp + first, SIZE_OF_AES_CTR_PRNG - first ); + rb_tail = ( rb_tail + SIZE_OF_AES_CTR_PRNG ) & ( STASH_CAPACITY - 1 ); + rb_count += SIZE_OF_AES_CTR_PRNG; } - stash_valid += SIZE_OF_AES_CTR_PRNG; } return 0; } /* ---------------- PRNG INIT ---------------- */ + +/** + * @brief Thread-local initialization wrapper around @c aes_ctr_prng_init(). + * + * @param[in,out] state Address of the caller’s PRNG state pointer. If `*state` + * is `NULL`, this function allocates one `aes_ctr_state_t`. + * @param[in] seed Seed descriptor as defined by NWIPE_PRNG_INIT_SIGNATURE. + * + * @retval 0 Success. + * @retval -1 Allocation or backend initialization failure (logged). + * + * @note + * Resets the ring buffer to empty. Consider a one-time prefill if your workload + * is dominated by tiny reads. 
+ */ int nwipe_aes_ctr_prng_init( NWIPE_PRNG_INIT_SIGNATURE ) { - nwipe_log( NWIPE_LOG_NOTICE, "Initializing AES‑CTR PRNG (thread‑local stash)" ); + nwipe_log( NWIPE_LOG_NOTICE, "Initializing AES-CTR PRNG (thread-local ring buffer)" ); if( *state == NULL ) { @@ -431,34 +591,93 @@ int nwipe_aes_ctr_prng_init( NWIPE_PRNG_INIT_SIGNATURE ) return -1; } - /* Reset this thread's stash */ - stash_pos = stash_valid = 0; + /* Reset ring to empty. */ + rb_head = rb_tail = rb_count = 0; return 0; } /* ---------------- PRNG READ ---------------- */ + +/** + * @brief Copy @p count bytes of keystream into @p buffer. + * + * @details + * Strategy: + * - If the request is "large" (>= CHUNK) and the ring is empty, use the + * direct-fill fast path and generate full CHUNKs directly into the output + * buffer to avoid an extra memcpy. + * - Otherwise, serve from the ring: + * * Ensure at least one byte is available via @c refill_stash_thread_local + * (non-blocking; production occurs only if one full CHUNK fits). + * * Copy the largest contiguous block starting at @c rb_head. + * * Opportunistically prefetch when sufficient free space exists to keep + * latency low for upcoming small reads. + * + * @param[out] buffer Destination buffer to receive keystream. + * @param[in] count Number of bytes to generate and copy. + * @param[in] ... Remaining parameters as defined by NWIPE_PRNG_READ_SIGNATURE. + * + * @retval 0 Success (exactly @p count bytes written). + * @retval -1 Backend/IO failure (already logged). + * + * @warning + * Per-thread API: do not share this state across threads. + */ int nwipe_aes_ctr_prng_read( NWIPE_PRNG_READ_SIGNATURE ) { unsigned char* out = buffer; size_t bytes_left = count; - while( bytes_left > 0 ) + /* Fast path: for large reads, bypass the ring if currently empty. + * Generate full CHUNKs directly into the destination to save one memcpy. */ + while( bytes_left >= SIZE_OF_AES_CTR_PRNG && rb_count == 0 ) { - /* Refill stash if necessary. 
*/ - if( refill_stash_thread_local( *state, 1 ) != 0 ) + if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) *state, out ) != 0 ) { - nwipe_log( NWIPE_LOG_ERROR, "PRNG refill failed" ); + nwipe_log( NWIPE_LOG_ERROR, "PRNG direct fill failed" ); return -1; } + out += SIZE_OF_AES_CTR_PRNG; + bytes_left -= SIZE_OF_AES_CTR_PRNG; + } - /* Copy as much as possible from stash to user buffer. */ - size_t available = stash_valid - stash_pos; - size_t chunk = ( bytes_left < available ) ? bytes_left : available; + /* General path: serve from ring, refilling as needed. */ + while( bytes_left > 0 ) + { + /* Ensure at least one byte is available for tiny reads. Refill only + * produces if a full CHUNK fits; otherwise we try again once consumer + * progress frees enough space. */ + if( rb_count == 0 ) + { + if( refill_stash_thread_local( *state, 1 ) != 0 ) + { + nwipe_log( NWIPE_LOG_ERROR, "PRNG refill failed" ); + return -1; + } + if( rb_count == 0 ) + continue; /* still no room for a CHUNK yet */ + } - memcpy( out, stash + stash_pos, chunk ); - stash_pos += chunk; - out += chunk; - bytes_left -= chunk; + /* Copy the largest contiguous span starting at rb_head. */ + size_t avail = rb_contig_used(); + size_t take = ( bytes_left < avail ) ? bytes_left : avail; + + memcpy( out, stash + rb_head, take ); + + rb_head = ( rb_head + take ) & ( STASH_CAPACITY - 1 ); + rb_count -= take; + out += take; + bytes_left -= take; + + /* Opportunistic prefetch to hide latency of future small reads. 
*/ + if( rb_free() >= ( 2 * SIZE_OF_AES_CTR_PRNG ) ) + { + if( refill_stash_thread_local( *state, SIZE_OF_AES_CTR_PRNG ) != 0 ) + { + nwipe_log( NWIPE_LOG_ERROR, "PRNG opportunistic refill failed" ); + return -1; + } + } } return 0; } diff --git a/src/prng.h b/src/prng.h index 2f13fc4..30e9047 100644 --- a/src/prng.h +++ b/src/prng.h @@ -80,8 +80,10 @@ int nwipe_aes_ctr_prng_read( NWIPE_PRNG_READ_SIGNATURE ); /* Size of the XOROSHIRO-256 is not derived from the architecture, but it is strictly 32 bytes */ #define SIZE_OF_XOROSHIRO256_PRNG 32 -/* Size of the AES-CTR is not derived from the architecture, but it is strictly 16k bytes */ -#define SIZE_OF_AES_CTR_PRNG 16384u -#define STASH_CAPACITY 65536u /* 64 KiB local pre‑fetch buffer */ +/* AES-CTR generation chunk size: fixed 128 KiB (not architecture-dependent) */ +#define SIZE_OF_AES_CTR_PRNG ( 128 * 1024 ) + +/* Thread-local prefetch ring buffer capacity: 1 MiB */ +#define STASH_CAPACITY ( 1024 * 1024 ) #endif /* PRNG_H_ */