Merge pull request #660 from Knogle/kernel-aes-ni

Implement high-performance AES-256-CTR PRNG via Linux kernel AF_ALG socket
This commit is contained in:
PartialVolume
2025-11-26 21:15:17 +00:00
committed by GitHub
8 changed files with 949 additions and 6 deletions

View File

@@ -11,6 +11,7 @@ AC_CONFIG_HEADERS([config.h])
# Checks for programs.
AC_PROG_CC
AC_PROG_CXX
PKG_PROG_PKG_CONFIG
# Checks for libraries.

View File

@@ -6,5 +6,5 @@ AM_LDFLAGS =
# this lists the binaries to produce, the (non-PHONY, binary) targets in
# the previous manual Makefile
bin_PROGRAMS = nwipe
nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c
nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c aes/aes_ctr_prng.h aes/aes_ctr_prng.cpp pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c
nwipe_LDADD = $(PARTED_LIBS) $(LIBCONFIG)

425
src/aes/aes_ctr_prng.cpp Normal file
View File

@@ -0,0 +1,425 @@
/**
* @file
* @brief High-throughput AES-CTR PRNG for nwipe using Linux AF_ALG.
*
* @details
* This translation unit implements a cryptographically strong pseudorandom
* byte stream based on AES-CTR, leveraging the Linux kernel's crypto API
* (AF_ALG) for hardware-accelerated AES (AES-NI/VAES/NEON/SVE where available).
*
* Motivation:
* - nwipe must supply multi-GB/s of random data to saturate modern NVMe/RAID.
* - User-space OpenSSL-based paths in older builds plateaued around ~250 MB/s
* on some systems due to syscall/memory-copy patterns not tuned for the
* workload.
* - The kernel provides highly optimized AES implementations and scheduling.
*
* Key properties:
* - A single AF_ALG operation socket is opened *once per thread* and reused
* for all generation calls (low syscall overhead).
* - Each generation produces a fixed-size chunk (see CHUNK) by issuing exactly
* two syscalls: `sendmsg()` (to supply IV and length) and `read()` (to fetch
* the keystream).
* - Counter management (increment) is done in user space for determinism.
*
* @warning
* IV (Counter) Encoding:
* This implementation builds the 128-bit AES-CTR IV by storing two 64-bit
* limbs in **little-endian** order (low limb at IV[0..7], high limb at
* IV[8..15]) and then incrementing the 128-bit value in little-endian form.
* This deviates from the big-endian counter semantics commonly assumed by
* RFC-style AES-CTR specifications. The stream remains secure (uniqueness
* of IVs is preserved) but is **not interoperable** with generic RFC-CTR
* streams. See `aes_ctr_prng.h` for a prominent header-level note.
*
* Threading:
* - `tls_op_fd` is thread-local; each thread owns its kernel op-socket.
* - The kernel cipher is re-entrant. No shared state in this TU is writable
* across threads.
*
* Error handling:
* - Functions return `0` on success and `-1` on failure. When underlying
* syscalls fail, `-1` is returned; callers may consult `errno` as usual.
*/
// ============================================================================================
// WHY THIS FILE EXISTS
// --------------------------------------------------------------------------------------------
// nwipe, a secure disk-wiping tool, needs cryptographically strong random data at multi-GB/s
// in order to keep up with today's NVMe and RAID arrays. Users complained when the classic
// user-space OpenSSL path plateaued around ~250 MB/s on modern CPUs. The Linux kernel
// already ships an extremely fast AES implementation (with transparent AES-NI / VAES / NEON
// acceleration) that can be accessed from user space via the AF_ALG socket family. By
// delegating the heavy crypto to the kernel we gain all of the following *for free*:
// • Perfectly tuned instruction selection per CPU (AES-NI, VAES, SVE, etc.)
// • Full cache-line prefetch scheduling written by kernel crypto maintainers
// • Zero-copy when the cipher runs in the same core
// • Automatic fall-back to software if the CPU lacks AES-NI
//
// DESIGN OVERVIEW (TL;DR)
// ----------------------
// ┌─ userspace ───────────────────────────────────────────────────────────────────────────────┐
// │ +-------------------------------+ │
// │ nwipe | aes_ctr_state_t (256 bit) | (1) initialise, store key+counter │
// │ +-------------------------------+ │
// │ │ ▲ │
// │ │ (2) sendmsg() + read() per fixed-size chunk │ │
// └─────────────────────┼───────────────────────────────────────────────────────────┤ kernel │
// │ │ space │
// persistent FD ▼ │ │
// ┌──────────────────────┐ │ │
// │ AF_ALG op socket │ (ctr(aes)) │ │
// └──────────────────────┘ └─────────┘
//
// Public ABI stability:
// ---------------------
// The fd is *not* part of the public state (preserves libnwipe ABI). A TU-local,
// thread-local descriptor is used internally; multiple PRNG instances per thread
// share the same op-socket as needed.
//
// Safety / threading:
// -------------------
// • The kernel cipher is re-entrant. Thread-local fd avoids cross-thread hazards.
// • Counter increments occur in user space; one aes_ctr_state_t per thread.
// ==============================================================================================
#include "aes_ctr_prng.h" // public header (256-bit state, extern "C" API)
#include <sys/socket.h> // socket(), bind(), accept(), sendmsg()
#include <linux/if_alg.h> // AF_ALG constants and skcipher API
#include <unistd.h> // read(), close()
#include <cerrno> // errno, EINTR
#include <cstring> // memcpy(), memset(), strcpy()
#include <array> // std::array for control buffer
// ----------------------------------------------------------------------------------------------
// CONFIGURABLE CHUNK SIZE
// ----------------------------------------------------------------------------------------------
// The per-call output size (CHUNK) can be configured at compile time via
// AES_CTR_PRNG_CHUNK_BYTES. Default is 128 KiB.
// Example:
// gcc -DAES_CTR_PRNG_CHUNK_BYTES="(64u*1024u)" ...
// ----------------------------------------------------------------------------------------------
#ifndef AES_CTR_PRNG_CHUNK_BYTES
#define AES_CTR_PRNG_CHUNK_BYTES (128u * 1024u) // 128 KiB default
#endif
// ----------------------------------------------------------------------------------------------
// GLOBAL 256-BIT KEY
// ----------------------------------------------------------------------------------------------
// • Loaded from user-supplied seed in aes_ctr_prng_init().
// • Intended to remain constant for the process lifetime (or until re-init).
// • Exposed (non-static) so out-of-TU tests can assert correct key handling.
//
// @note Consider zeroizing on shutdown to avoid key retention in core dumps.
// ----------------------------------------------------------------------------------------------
unsigned char global_key[32];
// ----------------------------------------------------------------------------------------------
// THREAD-LOCAL OPERATION SOCKET (one per nwipe thread)
// ----------------------------------------------------------------------------------------------
// Portable TLS qualifier: C++11 `thread_local` or GCC/Clang `__thread` for C builds.
//
// @invariant tls_op_fd == -1 ⇒ this thread has not opened the op-socket yet.
// tls_op_fd >= 0 ⇒ valid AF_ALG operation socket for "ctr(aes)".
//
// @thread_safety Thread-local; no synchronization required.
// ----------------------------------------------------------------------------------------------
#if defined(__cplusplus) && __cplusplus >= 201103L
#define PRNG_THREAD_LOCAL thread_local
#else
#define PRNG_THREAD_LOCAL __thread
#endif
PRNG_THREAD_LOCAL static int tls_op_fd = -1; // -1 ⇒ not yet opened in this thread
// ----------------------------------------------------------------------------------------------
// CONSTANTS / INTERNAL HELPERS
// ----------------------------------------------------------------------------------------------
namespace {
/**
* @brief AES block size in bytes (by specification).
*/
constexpr std::size_t AES_BLOCK = 16u;
/**
* @brief Fixed-size generation granularity per kernel call.
* @details
* Adjust at build time via AES_CTR_PRNG_CHUNK_BYTES to balance syscall
* overhead vs. latency and memory traffic.
* Typical values: 16 KiB (legacy default), 64 KiB, 128 KiB.
*/
constexpr std::size_t CHUNK = AES_CTR_PRNG_CHUNK_BYTES;
static_assert(CHUNK % AES_BLOCK == 0,
"AES_CTR_PRNG_CHUNK_BYTES must be a multiple of AES_BLOCK (16 bytes)");
/// Number of AES-CTR blocks produced per CHUNK.
constexpr std::size_t BLOCKS_PER_CHUNK = CHUNK / AES_BLOCK;
/**
 * @brief Serialize a 64-bit value into 8 bytes, least-significant byte first.
 *
 * @param v   Value to serialize.
 * @param buf Output pointer; the 8 bytes buf[0..7] are overwritten.
 *
 * @note
 * Each output byte is extracted arithmetically (mask + shift) rather than
 * copied from memory, so the resulting layout is little-endian on every host.
 */
static inline void store64_le(uint64_t v, unsigned char *buf)
{
    unsigned char *out = buf;
    unsigned char *const end = buf + 8;
    while (out != end) {
        *out++ = static_cast<unsigned char>(v & 0xFFu);  // emit current low byte
        v >>= 8;                                         // expose the next byte
    }
}
/**
 * @class ControlBuilder
 * @brief Assembles the `msghdr`, control messages and data iovec for one
 *        `sendmsg()` call on an AF_ALG skcipher operation socket.
 *
 * @details
 * The prepared message carries:
 * - Control message #1: `ALG_SET_OP` with value `ALG_OP_ENCRYPT`
 * - Control message #2: `ALG_SET_IV` holding a u32 length (16) followed by
 *   the 16 IV bytes
 * - One data iovec pointing at the caller's plaintext buffer
 *
 * All storage is embedded in the object itself; nothing is heap-allocated.
 *
 * @note
 * The IV length is written as a host-endian u32 (no byte swapping).
 *
 * @warning
 * The embedded `msghdr` stores pointers to this object's own `iov_` and
 * `control_` members. A copied object would therefore still point into the
 * source object, so copying is disabled.
 */
class ControlBuilder {
public:
    /**
     * @param iv    128-bit IV (counter value), passed as 16 bytes.
     * @param plain Pointer to the plaintext buffer referenced by the iovec.
     * @param len   Plaintext length in bytes (stored as the iovec length).
     */
    ControlBuilder(const unsigned char iv[16], void *plain, size_t len)
    {
        // ---------- Data iovec ----------
        iov_.iov_base = plain;
        iov_.iov_len = len;

        // ---------- msghdr --------------
        msg_.msg_name = nullptr;
        msg_.msg_namelen = 0;
        msg_.msg_iov = &iov_;
        msg_.msg_iovlen = 1;
        msg_.msg_control = control_.data();
        msg_.msg_controllen = control_.size();
        msg_.msg_flags = 0;

        // ---------- CMSG #1 : ALG_SET_OP = ENCRYPT ----------
        cmsghdr *op_hdr = CMSG_FIRSTHDR(&msg_);
        op_hdr->cmsg_level = SOL_ALG;
        op_hdr->cmsg_type = ALG_SET_OP;
        op_hdr->cmsg_len = CMSG_LEN(sizeof(uint32_t));
        // memcpy (not a casted store) keeps the payload write free of
        // alignment/aliasing assumptions, matching how the IV is written below.
        const uint32_t op = ALG_OP_ENCRYPT;
        std::memcpy(CMSG_DATA(op_hdr), &op, sizeof(op));

        // ---------- CMSG #2 : ALG_SET_IV ----------
        cmsghdr *iv_hdr = CMSG_NXTHDR(&msg_, op_hdr);
        iv_hdr->cmsg_level = SOL_ALG;
        iv_hdr->cmsg_type = ALG_SET_IV;
        iv_hdr->cmsg_len = CMSG_LEN(sizeof(uint32_t) + kIvLen);
        const uint32_t ivlen = kIvLen;  // host endian; not network order
        std::memcpy(CMSG_DATA(iv_hdr), &ivlen, sizeof(ivlen));
        std::memcpy(CMSG_DATA(iv_hdr) + sizeof(ivlen), iv, kIvLen);
    }

    // msg_ holds pointers into *this; a copy would alias the source object.
    ControlBuilder(const ControlBuilder &) = delete;
    ControlBuilder &operator=(const ControlBuilder &) = delete;

    /// @return Prepared msghdr suitable for `sendmsg()`.
    struct msghdr *msg() { return &msg_; }

private:
    /// IV length in bytes carried by the ALG_SET_IV control message.
    enum : uint32_t { kIvLen = 16 };

    // Control buffer sized for both control messages. It is accessed through
    // cmsghdr pointers, so it is explicitly aligned for that type.
    alignas(cmsghdr) std::array<char,
                                CMSG_SPACE(sizeof(uint32_t)) +
                                CMSG_SPACE(sizeof(uint32_t) + kIvLen)> control_{};
    struct msghdr msg_{};
    struct iovec iov_{};
};
/**
 * @brief Open a "ctr(aes)" skcipher operation socket via AF_ALG.
 *
 * @details
 * Performs the `socket()` → `bind()` → `setsockopt(ALG_SET_KEY)` → `accept4()`
 * sequence. The transform socket is closed before returning in every case;
 * only the operation socket (used for `sendmsg()` + `read()`) is handed back.
 *
 * Both descriptors are created with close-on-exec set (`SOCK_CLOEXEC`) so a
 * keyed cipher socket is not inherited by child processes spawned while the
 * PRNG is active.
 *
 * @param key AES key (32 bytes for AES-256); must be non-null.
 * @return Operation socket fd (>= 0) on success, or -1 on failure. On failure
 *         `errno` reflects the failing syscall (it is preserved across the
 *         internal `close()`).
 *
 * @note
 * `accept4()` is a Linux/GNU extension; AF_ALG itself is Linux-only, so this
 * does not narrow portability.
 *
 * @warning
 * `SIGPIPE` is not handled here; callers wanting that hardening can pass
 * `MSG_NOSIGNAL` to `sendmsg()`.
 */
static int open_ctr_socket(const unsigned char key[32])
{
    static const char kAlgType[] = "skcipher";
    static const char kAlgName[] = "ctr(aes)";

    if (!key) {
        errno = EINVAL;
        return -1;
    }

    // 1. Create transform socket (AF_ALG family).
    const int tfm = ::socket(AF_ALG, SOCK_SEQPACKET | SOCK_CLOEXEC, 0);
    if (tfm < 0) return -1;

    // 2. Describe requested algorithm: type = "skcipher", name = "ctr(aes)".
    sockaddr_alg sa = {};
    sa.salg_family = AF_ALG;
    static_assert(sizeof(kAlgType) <= sizeof(sa.salg_type), "algorithm type does not fit");
    static_assert(sizeof(kAlgName) <= sizeof(sa.salg_name), "algorithm name does not fit");
    std::memcpy(sa.salg_type, kAlgType, sizeof(kAlgType));  // includes the NUL
    std::memcpy(sa.salg_name, kAlgName, sizeof(kAlgName));  // includes the NUL

    // 3. Bind, upload the 256-bit key, then
    // 4. accept the operation socket — the fd used for sendmsg/read.
    int op = -1;
    if (::bind(tfm, reinterpret_cast<sockaddr*>(&sa), sizeof(sa)) == 0 &&
        ::setsockopt(tfm, SOL_ALG, ALG_SET_KEY, key, 32) == 0) {
        op = ::accept4(tfm, nullptr, nullptr, SOCK_CLOEXEC);
    }

    // The transform socket is no longer needed. close() may overwrite errno,
    // so keep the value describing the real failure for the caller.
    const int saved_errno = errno;
    ::close(tfm);
    if (op < 0) errno = saved_errno;

    return op;  // -1 on any failure above
}
/**
 * @brief Advance the 128-bit counter held in the PRNG state by @p n.
 *
 * @details
 * The counter is kept as two 64-bit limbs: `st->s[0]` is the low limb and
 * `st->s[1]` the high limb. @p n is added to the low limb; when that addition
 * wraps around, the high limb is incremented by one. The high limb itself is
 * allowed to wrap, giving arithmetic modulo 2^128.
 *
 * @param st PRNG state whose s[0]/s[1] limbs are updated in place.
 * @param n  Amount to add (the caller passes a number of AES blocks).
 */
static void ctr_add(aes_ctr_state_t *st, uint64_t n)
{
    const uint64_t low_sum = st->s[0] + n;
    const bool wrapped = low_sum < st->s[0];  // unsigned wrap ⇒ carry out of the low limb

    st->s[0] = low_sum;
    if (wrapped) {
        st->s[1] += 1;
    }
}
} // namespace (anonymous)
// =================================================================================================
// PUBLIC C API IMPLEMENTATION
// =================================================================================================
extern "C" {
/**
 * @brief Initialize PRNG state and (re)key the per-thread AF_ALG socket.
 *
 * @param[out] state      Pointer to PRNG state (must be non-null).
 * @param[in]  init_key   Seed as an array of unsigned long; must provide >= 32 bytes.
 * @param[in]  key_length Number of `unsigned long` words in @p init_key.
 *
 * @retval 0  Success.
 * @retval -1 Invalid parameters or AF_ALG setup failure. In that case neither
 *            @p state nor an already open per-thread socket is modified.
 *
 * @details
 * - Opens a new "ctr(aes)" operation socket keyed with the first 256 bits of
 *   the seed. If this thread already owns a socket from an earlier init, it
 *   is replaced, so the kernel key always matches the most recent seed.
 *   (Previously a second init in the same thread kept using the key of the
 *   first seed.)
 * - The socket is keyed from a local copy of the seed rather than from
 *   @c global_key, so a concurrent init in another thread cannot change the
 *   key between the copy and the kernel upload.
 * - Zeroes the entire state and copies the first 128 bits of the seed into
 *   the counter limbs s[0] (low) and s[1] (high).
 * - Stores the first 256 bits of the seed in @c global_key.
 *
 * @warning
 * All PRNG states used on one thread share that thread's socket; initializing
 * a second state on the same thread re-keys the stream of the first as well.
 *
 * @warning
 * The IV scheme is little-endian and not RFC-interoperable. Do not mix with
 * external AES-CTR generators expecting big-endian counters.
 */
int aes_ctr_prng_init(aes_ctr_state_t *state,
                      unsigned long init_key[],
                      unsigned long key_length)
{
    constexpr std::size_t KEY_BYTES = sizeof(global_key);  // 32 bytes (AES-256)
    // Words needed to cover KEY_BYTES; compared by division so a huge
    // key_length cannot overflow a multiplication.
    constexpr std::size_t MIN_WORDS =
        (KEY_BYTES + sizeof(unsigned long) - 1) / sizeof(unsigned long);

    if (!state || !init_key || key_length < MIN_WORDS)
        return -1;

    // Key the kernel cipher from a private copy of the seed.
    unsigned char key[KEY_BYTES];
    std::memcpy(key, init_key, KEY_BYTES);

    const int fresh_fd = open_ctr_socket(key);
    if (fresh_fd < 0)
        return -1;

    // Swap in the freshly keyed socket; drop the one keyed with an older seed.
    if (tls_op_fd >= 0)
        ::close(tls_op_fd);
    tls_op_fd = fresh_fd;

    // Zero entire state, then put seed[0..15] into the counter limbs.
    std::memset(state, 0, sizeof(*state));
    std::memcpy(state->s, init_key, sizeof(uint64_t) * 2);

    // Publish the key as before (non-static global, see its declaration).
    std::memcpy(global_key, key, KEY_BYTES);
    return 0;
}
/**
 * @brief Produce exactly CHUNK bytes of keystream into @p bufpos.
 *
 * @param[in,out] state  PRNG state; its counter is advanced on success only.
 * @param[out]    bufpos Destination buffer; must hold at least CHUNK bytes.
 *
 * @retval 0  Success (CHUNK bytes written, counter advanced).
 * @retval -1 Parameter error, no open per-thread socket, or syscall failure.
 *
 * @details
 * Sequence per call:
 *  1. Assemble a 128-bit IV by storing s[0] (low) and s[1] (high) as
 *     little-endian 64-bit words into a 16-byte buffer.
 *  2. Build the AF_ALG control message (ALG_SET_OP=ENCRYPT, ALG_SET_IV=IV)
 *     and a data iovec pointing to a static all-zero plaintext of length CHUNK.
 *  3. `sendmsg()` the request, retrying if interrupted by a signal before any
 *     data was queued. A partial send is treated as failure because the
 *     remainder could not be resent with the same control messages.
 *  4. `read()` until CHUNK bytes of ciphertext have arrived, tolerating short
 *     reads and `EINTR`. Because the plaintext is zero, the ciphertext equals
 *     the keystream.
 *  5. Increment the 128-bit counter by `BLOCKS_PER_CHUNK`.
 *
 * @note
 * The zero-plaintext buffer has static storage duration and is therefore
 * zero-initialized; it is only ever read.
 */
int aes_ctr_prng_genrand_128k_to_buf(aes_ctr_state_t *state,
                                     unsigned char *bufpos)
{
    if (!state || !bufpos || tls_op_fd < 0)
        return -1;

    // --- Construct 128-bit IV from counter (little-endian limbs) -------------
    unsigned char iv[16];
    store64_le(state->s[0], iv);      // low limb  → IV[0..7]
    store64_le(state->s[1], iv + 8);  // high limb → IV[8..15]

    // --- Build msghdr ---------------------------------------------------------
    static unsigned char zeros[CHUNK] = {0};
    ControlBuilder ctl(iv, zeros, CHUNK);

    // --- sendmsg(): retry only when interrupted before anything was sent ------
    ssize_t sent;
    do {
        sent = ::sendmsg(tls_op_fd, ctl.msg(), 0);
    } while (sent < 0 && errno == EINTR);
    if (sent != static_cast<ssize_t>(CHUNK))
        return -1;

    // --- read(): collect the full chunk, allowing short reads -----------------
    std::size_t got = 0;
    while (got < CHUNK) {
        const ssize_t r = ::read(tls_op_fd, bufpos + got, CHUNK - got);
        if (r > 0) {
            got += static_cast<std::size_t>(r);
        } else if (r < 0 && errno == EINTR) {
            continue;  // interrupted before any byte was transferred
        } else {
            return -1;  // hard error, or 0 bytes (nothing more to read)
        }
    }

    // --- Advance counter ------------------------------------------------------
    ctr_add(state, BLOCKS_PER_CHUNK);
    return 0;
}
/**
 * @brief Release the calling thread's AF_ALG operation socket, if any.
 *
 * @retval 0 Always.
 *
 * @details
 * When this thread holds an open operation socket it is closed and the
 * thread-local descriptor is reset to -1 ("not opened"). Calling this when no
 * socket is open is a no-op. Only the calling thread's descriptor is touched;
 * @c global_key is left unchanged.
 */
int aes_ctr_prng_shutdown(void)
{
    if (tls_op_fd < 0)
        return 0;  // nothing open in this thread

    const int fd = tls_op_fd;
    tls_op_fd = -1;
    ::close(fd);
    return 0;
}
} // extern "C"

60
src/aes/aes_ctr_prng.h Normal file
View File

@@ -0,0 +1,60 @@
#ifndef AES_CTR_PRNG_H
#define AES_CTR_PRNG_H
/* Minimal public header for AES-256-CTR PRNG (Linux AF_ALG backend)
*
* Implementation detail:
* - Uses a persistent AF_ALG "ctr(aes)" operation socket opened at init.
* - No socket setup overhead during generation: only sendmsg + read.
* - Thread-safety: Not safe unless externally synchronized.
*
* Public state remains exactly 256 bits (4×64-bit words) to allow for
* minimalistic integration in nwipe and similar tools.
*/
#include <stdint.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/* PRNG state: exactly 256 bits (4 × 64-bit words)
*
* s[0] = counter low
* s[1] = counter high
* s[2], s[3] = reserved
*/
typedef struct aes_ctr_state_s {
uint64_t s[4];
} aes_ctr_state_t;
/* Initialize with >=32 bytes of seed (init_key as unsigned-long array)
*
* On first call, also opens the persistent AF_ALG socket.
* Returns 0 on success, -1 on failure.
*/
int aes_ctr_prng_init(aes_ctr_state_t *state,
unsigned long init_key[],
unsigned long key_length);
/* Generate one 128 KiB chunk of random data into bufpos.
*
* Returns 0 on success, -1 on failure.
* Uses the persistent AF_ALG socket.
*/
int aes_ctr_prng_genrand_128k_to_buf(aes_ctr_state_t *state,
unsigned char *bufpos);
/* Optional: Close the persistent AF_ALG socket at program shutdown.
*
* Not required by nwipe, but recommended for tools embedding this code.
*/
int aes_ctr_prng_shutdown(void);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* AES_CTR_PRNG_H */

View File

@@ -1640,14 +1640,14 @@ void nwipe_gui_prng( void )
extern nwipe_prng_t nwipe_twister;
extern nwipe_prng_t nwipe_isaac;
extern nwipe_prng_t nwipe_isaac64;
extern nwipe_prng_t nwipe_aes_ctr_prng;
extern nwipe_prng_t nwipe_xoroshiro256_prng;
extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng;
extern nwipe_prng_t nwipe_aes_ctr_prng;
extern int terminate_signal;
/* The number of implemented PRNGs. */
const int count = 5;
const int count = 6;
/* The first tabstop. */
const int tab1 = 2;
@@ -1689,6 +1689,10 @@ void nwipe_gui_prng( void )
{
focus = 4;
}
if( nwipe_options.prng == &nwipe_aes_ctr_prng )
{
focus = 5;
}
do
{
/* Clear the main window. */
@@ -1705,6 +1709,7 @@ void nwipe_gui_prng( void )
mvwprintw( main_window, yy++, tab1, " %s", nwipe_isaac64.label );
mvwprintw( main_window, yy++, tab1, " %s", nwipe_add_lagg_fibonacci_prng.label );
mvwprintw( main_window, yy++, tab1, " %s", nwipe_xoroshiro256_prng.label );
mvwprintw( main_window, yy++, tab1, " %s", nwipe_aes_ctr_prng.label );
yy++;
/* Print the cursor. */
@@ -1879,6 +1884,30 @@ void nwipe_gui_prng( void )
tab1,
"especially for legacy systems, due to its efficiency and minimal demands. " );
break;
case 5:
mvwprintw(
main_window, yy++, tab1, "AES-256 in Counter Mode (CTR), securely implemented by Fabian Druschke" );
mvwprintw( main_window, yy++, tab1, "using the Linux kernel's AF_ALG cryptographic API for efficient" );
mvwprintw( main_window, yy++, tab1, "pseudo-random data generation with minimal user-space overhead." );
mvwprintw( main_window,
yy++,
tab1,
" " );
mvwprintw(
main_window, yy++, tab1, "This integration leverages potential hardware acceleration via AES-NI," );
mvwprintw(
main_window, yy++, tab1, "making AES-256 CTR ideal for secure and fast data wiping in nwipe." );
mvwprintw( main_window,
yy++,
tab1,
" " );
mvwprintw( main_window,
yy++,
tab1,
"Compliant with NIST SP 800-38A, it is a global standard for encryption." );
mvwprintw(
main_window, yy++, tab1, "Designed for 64-bit Linux systems with kernel CryptoAPI support." );
break;
}
/* switch */
@@ -1949,6 +1978,11 @@ void nwipe_gui_prng( void )
{
nwipe_options.prng = &nwipe_xoroshiro256_prng;
}
if( focus == 5 )
{
nwipe_options.prng = &nwipe_aes_ctr_prng;
}
return;
case KEY_BACKSPACE:

View File

@@ -32,6 +32,49 @@
/* The global options struct. */
nwipe_options_t nwipe_options;
/*
 * Executes the CPUID instruction for the given leaf with sub-leaf (ECX) 0 and
 * stores the four result registers through the provided pointers.
 *
 * eax: The leaf (function number) to query with CPUID.
 * *eax_out, *ebx_out, *ecx_out, *edx_out: Receive the EAX/EBX/ECX/EDX results.
 *
 * ECX is explicitly zeroed before the instruction runs. Several leaves use ECX
 * as a sub-leaf selector, so leaving it uninitialised would make their output
 * depend on whatever happened to be in the register.
 *
 * On non-x86 targets there is no CPUID instruction; all four outputs are set
 * to 0 instead.
 */
void cpuid( uint32_t eax, uint32_t* eax_out, uint32_t* ebx_out, uint32_t* ecx_out, uint32_t* edx_out )
{
#if defined( __i386__ ) || defined( __x86_64__ ) /* only on x86 */
#if defined( _MSC_VER ) /* MSVC */
    int r[4];
    __cpuid( r, eax );
    *eax_out = r[0];
    *ebx_out = r[1];
    *ecx_out = r[2];
    *edx_out = r[3];
#elif defined( __GNUC__ ) /* GCC/Clang */
    /* Work on locals so the asm operands are plain registers, then publish. */
    uint32_t a = eax; /* in: leaf,        out: EAX */
    uint32_t b = 0; /*                   out: EBX */
    uint32_t c = 0; /* in: sub-leaf 0,   out: ECX */
    uint32_t d = 0; /*                   out: EDX */
    __asm__ __volatile__( "cpuid" : "+a"( a ), "=b"( b ), "+c"( c ), "=d"( d ) );
    *eax_out = a;
    *ebx_out = b;
    *ecx_out = c;
    *edx_out = d;
#else
#error "Unsupported compiler"
#endif
#else /* not-x86 */
    (void) eax;
    *eax_out = *ebx_out = *ecx_out = *edx_out = 0; /* CPUID = 0 */
#endif
}
/*
 * Reports whether the processor advertises the AES-NI instruction set.
 *
 * On x86 this queries CPUID leaf 1 and tests bit 25 of the returned ECX value.
 * On every other architecture it returns 0 without querying anything.
 *
 * Returns 1 if AES-NI is advertised, 0 otherwise.
 */
int has_aes_ni( void )
{
#if !defined( __i386__ ) && !defined( __x86_64__ ) /* ARM, RISC-V … */
    return 0; /* no AES-NI */
#else /* only for x86 */
    uint32_t regs[4]; /* EAX, EBX, ECX, EDX */

    cpuid( 1, &regs[0], &regs[1], &regs[2], &regs[3] );

    /* Bit 25 of ECX = AES-NI */
    if( ( regs[2] >> 25 ) & 1u )
    {
        return 1;
    }
    return 0;
#endif
}
int nwipe_options_parse( int argc, char** argv )
{
extern char* optarg; // The working getopt option argument.
@@ -44,6 +87,7 @@ int nwipe_options_parse( int argc, char** argv )
extern nwipe_prng_t nwipe_isaac64;
extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng;
extern nwipe_prng_t nwipe_xoroshiro256_prng;
extern nwipe_prng_t nwipe_aes_ctr_prng;
/* The getopt() result holder. */
int nwipe_opt;
@@ -133,8 +177,26 @@ int nwipe_options_parse( int argc, char** argv )
nwipe_options.autonuke = 0;
nwipe_options.autopoweroff = 0;
nwipe_options.method = &nwipe_random;
nwipe_options.prng =
( sizeof( unsigned long int ) >= 8 ) ? &nwipe_xoroshiro256_prng : &nwipe_add_lagg_fibonacci_prng;
/*
* Determines and sets the default PRNG based on AES-NI support and system architecture.
* It selects AES-CTR PRNG if AES-NI is supported, xoroshiro256 for 64-bit systems without AES-NI,
* and add lagged Fibonacci for 32-bit systems.
*/
if( has_aes_ni() )
{
nwipe_options.prng = &nwipe_aes_ctr_prng;
}
else if( sizeof( unsigned long int ) >= 8 )
{
nwipe_options.prng = &nwipe_xoroshiro256_prng;
nwipe_log( NWIPE_LOG_WARNING, "CPU doesn't support AES New Instructions, opting for XORoshiro-256 instead." );
}
else
{
nwipe_options.prng = &nwipe_add_lagg_fibonacci_prng;
}
nwipe_options.rounds = 1;
nwipe_options.noblank = 0;
nwipe_options.nousb = 0;
@@ -557,6 +619,11 @@ int nwipe_options_parse( int argc, char** argv )
nwipe_options.prng = &nwipe_xoroshiro256_prng;
break;
}
if( strcmp( optarg, "aes_ctr_prng" ) == 0 )
{
nwipe_options.prng = &nwipe_aes_ctr_prng;
break;
}
/* Else we do not know this PRNG. */
fprintf( stderr, "Error: Unknown prng '%s'.\n", optarg );
@@ -615,6 +682,7 @@ void nwipe_options_log( void )
extern nwipe_prng_t nwipe_isaac64;
extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng;
extern nwipe_prng_t nwipe_xoroshiro256_prng;
extern nwipe_prng_t nwipe_aes_ctr_prng;
/**
* Prints a manifest of options to the log.
@@ -674,6 +742,10 @@ void nwipe_options_log( void )
{
nwipe_log( NWIPE_LOG_NOTICE, " prng = XORoshiro-256" );
}
else if( nwipe_options.prng == &nwipe_aes_ctr_prng )
{
nwipe_log( NWIPE_LOG_NOTICE, " prng = AES-CTR New Instructions (EXPERIMENTAL!)" );
}
else if( nwipe_options.prng == &nwipe_isaac )
{
nwipe_log( NWIPE_LOG_NOTICE, " prng = Isaac" );
@@ -766,7 +838,7 @@ void display_help()
puts( " -P, --PDFreportpath=PATH Path to write PDF reports to. Default is \".\"" );
puts( " If set to \"noPDF\" no PDF reports are written.\n" );
puts( " -p, --prng=METHOD PRNG option "
"(mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng|xoroshiro256_prng)\n" );
"(mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng|xoroshiro256_prng|aes_ctr_prng)\n" );
puts( " -q, --quiet Anonymize logs and the GUI by removing unique data, i.e." );
puts( " serial numbers, LU WWN Device ID, and SMBIOS/DMI data." );
puts( " XXXXXX = S/N exists, ????? = S/N not obtainable\n" );

View File

@@ -27,6 +27,7 @@
#include "isaac_rand/isaac64.h"
#include "alfg/add_lagg_fibonacci_prng.h" //Lagged Fibonacci generator prototype
#include "xor/xoroshiro256_prng.h" //XORoshiro-256 prototype
#include "aes/aes_ctr_prng.h" // AES-NI prototype
nwipe_prng_t nwipe_twister = { "Mersenne Twister (mt19937ar-cok)", nwipe_twister_init, nwipe_twister_read };
@@ -40,6 +41,9 @@ nwipe_prng_t nwipe_add_lagg_fibonacci_prng = { "Lagged Fibonacci generator",
/* XOROSHIRO-256 PRNG Structure */
nwipe_prng_t nwipe_xoroshiro256_prng = { "XORoshiro-256", nwipe_xoroshiro256_prng_init, nwipe_xoroshiro256_prng_read };
/* AES-CTR-NI PRNG Structure */
nwipe_prng_t nwipe_aes_ctr_prng = { "AES-CTR (Kernel)", nwipe_aes_ctr_prng_init, nwipe_aes_ctr_prng_read };
/* Print given number of bytes from unsigned integer number to a byte stream buffer starting with low-endian. */
static inline void u32_to_buffer( u8* restrict buffer, u32 val, const int len )
{
@@ -340,3 +344,340 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE )
return 0; // Success
}
/**
* @brief Initialize the AES-CTR PRNG state for this thread.
*
* @details
* Initializes the thread-local PRNG based on the supplied seed and resets the
* ring-buffer prefetch cache. The underlying AES-CTR implementation uses a
* persistent AF_ALG operation socket per thread, opened lazily by
* aes_ctr_prng_init(). The public state only stores a 128-bit counter while
* the kernel keeps the expanded AES key schedule.
*
* @param[in,out] state Pointer to an opaque PRNG state handle. If `*state` is
* `NULL`, this function allocates it with `calloc()`.
* @param[in] seed Seed material (must contain at least 32 bytes).
* @param[in] ... Remaining parameters as defined by NWIPE_PRNG_INIT_SIGNATURE.
*
* @note
* The ring is intentionally left empty to keep init fast. Callers may choose to
* "prefill" by invoking refill_stash_thread_local(*state, SIZE_OF_AES_CTR_PRNG)
* once to amortize first-use latency for tiny reads.
*
* @retval 0 Success.
* @retval -1 Allocation or initialization failure (already logged).
*/
/*
* High-throughput wrapper with a thread-local ring-buffer prefetch
* ----------------------------------------------------------------
* This glue layer implements NWIPE_PRNG_INIT / NWIPE_PRNG_READ around the
* persistent kernel-AES PRNG. It maintains a lock-free, thread-local ring
* buffer ("stash") that caches keystream blocks produced in fixed-size chunks
* (SIZE_OF_AES_CTR_PRNG; e.g., 16 KiB or 256 KiB).
*
* Rationale:
* - Nwipe frequently requests small slices (e.g., 32 B, 512 B, 4 KiB). Issuing
* one kernel call per small read would be syscall- and copy-bound.
* - By fetching larger chunks and serving small reads from the ring buffer,
* we reduce syscall rate and memory traffic and approach memcpy-limited
* throughput on modern CPUs with AES acceleration.
*
* Why a ring buffer (over a linear stash + memmove):
* - No O(n) memmove() when the buffer fills with a tail of unread bytes.
* - Constant-time head/tail updates via modulo arithmetic.
* - Better cache locality and fewer TLB/cache misses; improved prefetching.
*/
/** @def NW_THREAD_LOCAL
* @brief Portable thread-local specifier for C11 and GNU C.
*
* The ring buffer and its indices are thread-local, so no synchronization
* (locks/atomics) is required. Do not share this state across threads.
*/
#if defined( __STDC_VERSION__ ) && __STDC_VERSION__ >= 201112L
#define NW_THREAD_LOCAL _Thread_local
#else
#define NW_THREAD_LOCAL __thread
#endif
/** @def NW_ALIGN
* @brief Minimal alignment helper for hot buffers/structures.
*
* 64-byte alignment targets typical cacheline boundaries to reduce false
* sharing and improve hardware prefetch effectiveness for linear scans.
*/
#if defined( __GNUC__ ) || defined( __clang__ )
#define NW_ALIGN( N ) __attribute__( ( aligned( N ) ) )
#else
#define NW_ALIGN( N ) _Alignas( N )
#endif
/**
* @def STASH_CAPACITY
* @brief Ring capacity in bytes (power-of-two; multiple of CHUNK).
*
* @details
* Defaults to 1 MiB. Must be:
* - a power of two (allows modulo via bitmask),
* - a multiple of SIZE_OF_AES_CTR_PRNG, so each produced chunk fits whole.
*
* @note
* Practical choices: 512 KiB … 4 MiB depending on CHUNK size and workload.
* For SIZE_OF_AES_CTR_PRNG = 256 KiB, 1 MiB yields four in-flight chunks and
* works well for nwipe's small-read patterns.
*/
#ifndef STASH_CAPACITY
#define STASH_CAPACITY ( 1u << 20 ) /* 1 MiB */
#endif
#if defined( __STDC_VERSION__ ) && __STDC_VERSION__ >= 201112L
_Static_assert( ( STASH_CAPACITY & ( STASH_CAPACITY - 1 ) ) == 0, "STASH_CAPACITY must be a power of two" );
_Static_assert( ( STASH_CAPACITY % SIZE_OF_AES_CTR_PRNG ) == 0,
"STASH_CAPACITY must be a multiple of SIZE_OF_AES_CTR_PRNG" );
#endif
/** @brief Thread-local ring buffer storage for prefetched keystream. */
NW_THREAD_LOCAL static unsigned char stash[STASH_CAPACITY] NW_ALIGN( 64 );
/**
 * @name Ring indices (thread-local)
 * @{
 * @var rb_head  Next read position (consumer cursor).
 * @var rb_tail  Next write position (producer cursor).
 * @var rb_count Number of valid bytes currently stored.
 *
 * @invariant
 * - 0 <= rb_count <= STASH_CAPACITY
 * - rb_head, rb_tail in [0, STASH_CAPACITY)
 * - (rb_tail - rb_head) mod STASH_CAPACITY == rb_count
 *
 * @note
 * rb_head == rb_tail holds both when the ring is empty and when it is
 * completely full; rb_count is what distinguishes the two cases.
 *
 * @warning
 * These variables are TLS and must not be accessed from or shared with other
 * threads. One PRNG instance per thread.
 * @}
 */
NW_THREAD_LOCAL static size_t rb_head = 0; /* next byte to read */
NW_THREAD_LOCAL static size_t rb_tail = 0; /* next byte to write */
NW_THREAD_LOCAL static size_t rb_count = 0; /* occupied bytes */
/**
 * @brief Number of bytes that can still be written into the ring.
 * @return STASH_CAPACITY minus the bytes currently buffered
 *         (0 … STASH_CAPACITY).
 */
static inline size_t rb_free( void )
{
    const size_t occupied = rb_count;

    return STASH_CAPACITY - occupied;
}
/**
 * @brief Length of the readable run that starts at @c rb_head and stops at
 *        the physical end of the ring at the latest.
 * @return Bytes that a single memcpy() can take from @c stash + rb_head
 *         without wrapping around.
 */
static inline size_t rb_contig_used( void )
{
    const size_t until_wrap = STASH_CAPACITY - rb_head;

    /* The buffered data may extend past the physical end; cap it there. */
    if( rb_count >= until_wrap )
    {
        return until_wrap;
    }
    return rb_count;
}
/**
 * @brief Length of the writable run that starts at @c rb_tail and stops at
 *        the physical end of the ring at the latest.
 * @return Bytes that can be written at @c stash + rb_tail without wrapping
 *         around.
 */
static inline size_t rb_contig_free( void )
{
    const size_t until_wrap = STASH_CAPACITY - rb_tail;
    const size_t unused = rb_free();

    /* Free space may continue at the start of the ring; cap at the end. */
    if( unused >= until_wrap )
    {
        return until_wrap;
    }
    return unused;
}
/**
* @brief Ensure at least @p need bytes are buffered in the ring.
*
* @details
* Production model:
* - The kernel PRNG produces keystream in fixed-size chunks
* (SIZE_OF_AES_CTR_PRNG bytes; e.g., 16 KiB or 256 KiB).
* - We only ever append *whole* chunks. If total free space is less than one
* chunk, no production occurs (non-blocking style); the caller should first
* consume data and try again.
*
* Wrap handling:
* - Fast path: if a contiguous free region of at least one chunk exists at
* @c rb_tail, generate directly into @c stash + rb_tail (zero extra copies).
* - Wrap path: otherwise, generate one chunk into a small temporary buffer and
* split-copy into [rb_tail..end) and [0..rest). This case is infrequent and
* still cheaper than memmoving ring contents.
*
* @param[in] state Pointer to the AES-CTR state (per-thread).
* @param[in] need Minimum number of bytes the caller would like to have ready.
*
* @retval 0 Success (or no space to produce yet).
* @retval -1 PRNG failure (aes_ctr_prng_genrand_128k_to_buf() error).
*
* @warning
* Thread-local only. Do not call concurrently from multiple threads that share
* the same TLS variables.
*/
static int refill_stash_thread_local( void* state, size_t need )
{
while( rb_count < need )
{
/* Not enough total free space for a full CHUNK → let the caller read first. */
if( rb_free() < SIZE_OF_AES_CTR_PRNG )
break;
size_t cf = rb_contig_free();
if( cf >= SIZE_OF_AES_CTR_PRNG )
{
/* Fast path: generate straight into the ring. */
if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) state, stash + rb_tail ) != 0 )
return -1;
rb_tail = ( rb_tail + SIZE_OF_AES_CTR_PRNG ) & ( STASH_CAPACITY - 1 );
rb_count += SIZE_OF_AES_CTR_PRNG;
}
else
{
/* Wrap path: temporary production, then split-copy. */
unsigned char tmp[SIZE_OF_AES_CTR_PRNG];
if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) state, tmp ) != 0 )
return -1;
size_t first = STASH_CAPACITY - rb_tail; /* bytes to physical end */
memcpy( stash + rb_tail, tmp, first );
memcpy( stash, tmp + first, SIZE_OF_AES_CTR_PRNG - first );
rb_tail = ( rb_tail + SIZE_OF_AES_CTR_PRNG ) & ( STASH_CAPACITY - 1 );
rb_count += SIZE_OF_AES_CTR_PRNG;
}
}
return 0;
}
/* ---------------- PRNG INIT ---------------- */
/**
* @brief Thread-local initialization wrapper around @c aes_ctr_prng_init().
*
* @param[in,out] state Address of the callers PRNG state pointer. If `*state`
* is `NULL`, this function allocates one `aes_ctr_state_t`.
* @param[in] seed Seed descriptor as defined by NWIPE_PRNG_INIT_SIGNATURE.
*
* @retval 0 Success.
* @retval -1 Allocation or backend initialization failure (logged).
*
* @note
* Resets the ring buffer to empty. Consider a one-time prefill if your workload
* is dominated by tiny reads.
*/
int nwipe_aes_ctr_prng_init( NWIPE_PRNG_INIT_SIGNATURE )
{
nwipe_log( NWIPE_LOG_NOTICE, "Initializing AES-CTR PRNG (thread-local ring buffer)" );
if( *state == NULL )
{
*state = calloc( 1, sizeof( aes_ctr_state_t ) );
if( *state == NULL )
{
nwipe_log( NWIPE_LOG_FATAL, "calloc() failed for PRNG state" );
return -1;
}
}
int rc = aes_ctr_prng_init(
(aes_ctr_state_t*) *state, (unsigned long*) seed->s, seed->length / sizeof( unsigned long ) );
if( rc != 0 )
{
nwipe_log( NWIPE_LOG_ERROR, "aes_ctr_prng_init() failed" );
return -1;
}
/* Reset ring to empty. */
rb_head = rb_tail = rb_count = 0;
return 0;
}
/* ---------------- PRNG READ ---------------- */
/**
* @brief Copy @p count bytes of keystream into @p buffer.
*
* @details
* Strategy:
* - If the request is "large" (>= CHUNK) and the ring is empty, use the
* direct-fill fast path and generate full CHUNKs directly into the output
* buffer to avoid an extra memcpy.
* - Otherwise, serve from the ring:
* * Ensure at least one byte is available via @c refill_stash_thread_local
* (non-blocking; production occurs only if one full CHUNK fits).
* * Copy the largest contiguous block starting at @c rb_head.
* * Opportunistically prefetch when sufficient free space exists to keep
* latency low for upcoming small reads.
*
* @param[out] buffer Destination buffer to receive keystream.
* @param[in] count Number of bytes to generate and copy.
* @param[in] ... Remaining parameters as defined by NWIPE_PRNG_READ_SIGNATURE.
*
* @retval 0 Success (exactly @p count bytes written).
* @retval -1 Backend/IO failure (already logged).
*
* @warning
* Per-thread API: do not share this state across threads.
*/
int nwipe_aes_ctr_prng_read( NWIPE_PRNG_READ_SIGNATURE )
{
unsigned char* out = buffer;
size_t bytes_left = count;
/* Fast path: for large reads, bypass the ring if currently empty.
* Generate full CHUNKs directly into the destination to save one memcpy. */
while( bytes_left >= SIZE_OF_AES_CTR_PRNG && rb_count == 0 )
{
if( aes_ctr_prng_genrand_128k_to_buf( (aes_ctr_state_t*) *state, out ) != 0 )
{
nwipe_log( NWIPE_LOG_ERROR, "PRNG direct fill failed" );
return -1;
}
out += SIZE_OF_AES_CTR_PRNG;
bytes_left -= SIZE_OF_AES_CTR_PRNG;
}
/* General path: serve from ring, refilling as needed. */
while( bytes_left > 0 )
{
/* Ensure at least one byte is available for tiny reads. Refill only
* produces if a full CHUNK fits; otherwise we try again once consumer
* progress frees enough space. */
if( rb_count == 0 )
{
if( refill_stash_thread_local( *state, 1 ) != 0 )
{
nwipe_log( NWIPE_LOG_ERROR, "PRNG refill failed" );
return -1;
}
if( rb_count == 0 )
continue; /* still no room for a CHUNK yet */
}
/* Copy the largest contiguous span starting at rb_head. */
size_t avail = rb_contig_used();
size_t take = ( bytes_left < avail ) ? bytes_left : avail;
memcpy( out, stash + rb_head, take );
rb_head = ( rb_head + take ) & ( STASH_CAPACITY - 1 );
rb_count -= take;
out += take;
bytes_left -= take;
/* Opportunistic prefetch to hide latency of future small reads. */
if( rb_free() >= ( 2 * SIZE_OF_AES_CTR_PRNG ) )
{
if( refill_stash_thread_local( *state, SIZE_OF_AES_CTR_PRNG ) != 0 )
{
nwipe_log( NWIPE_LOG_ERROR, "PRNG opportunistic refill failed" );
return -1;
}
}
}
return 0;
}

View File

@@ -63,6 +63,10 @@ int nwipe_add_lagg_fibonacci_prng_read( NWIPE_PRNG_READ_SIGNATURE );
int nwipe_xoroshiro256_prng_init( NWIPE_PRNG_INIT_SIGNATURE );
int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE );
/* AES-CTR PRNG prototypes: init (re)seeds the generator and empties its
 * thread-local prefetch ring; read copies the requested number of keystream
 * bytes into the caller's buffer. Both return 0 on success and -1 on failure. */
int nwipe_aes_ctr_prng_init( NWIPE_PRNG_INIT_SIGNATURE );
int nwipe_aes_ctr_prng_read( NWIPE_PRNG_READ_SIGNATURE );
/* Size of the twister is not derived from the architecture, but it is strictly 4 bytes */
#define SIZE_OF_TWISTER 4
@@ -76,4 +80,10 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE );
/* Size of the XOROSHIRO-256 is not derived from the architecture, but it is strictly 32 bytes */
#define SIZE_OF_XOROSHIRO256_PRNG 32
/* AES-CTR generation chunk size: fixed 128 KiB (not architecture-dependent) */
#define SIZE_OF_AES_CTR_PRNG ( 128 * 1024 )
/* Thread-local prefetch ring buffer capacity: 1 MiB. The implementation
 * statically asserts (when built as C11 or newer) that this value is a power
 * of two and a multiple of SIZE_OF_AES_CTR_PRNG. */
#define STASH_CAPACITY ( 1024 * 1024 )
#endif /* PRNG_H_ */