diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 00ccaf27..bafc1f1a 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -87,10 +87,10 @@ jobs:
         - --examples --features=board/imxrt1010evk,board/lcd1602
         - --examples --features=board/imxrt1060evk,board/lcd1602
         # SPI examples (might break other examples)
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/teensy4,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1010evk,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1060evk,board/spi
-        - --example=hal_spi --example=rtic_spi --example=async_dma_spi --features=board/imxrt1170evk-cm7,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/teensy4,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1010evk,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1060evk,board/spi
+        - --example=rtic_spi_blocking --example=rtic_spi --example=async_dma_spi --features=board/imxrt1170evk-cm7,board/spi
         # The i.MX RT 1170 EVK (CM7) target is WIP. The list below describes the working examples.
         - --features=board/imxrt1170evk-cm7,board/lcd1602 --example=hal_led
           --example=hal_gpio_input --example=rtic_gpio_input
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1faed3c..99cce155 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,14 @@ Introduce LPSPI improvements:
 - Allow users to change the watermark while enabled. Deprecate the corresponding
   method on the `Disabled` helper.
 
+Change how the LPSPI driver manages the FIFOs. As a result of this change, the
+driver never returns the `Busy` or `NoData` errors through the embedded-hal
+interfaces. Instead of returning `Busy`, the driver blocks until there's space in
+the FIFO. If the caller provides an empty buffer, then the result is OK.
+
+The LPSPI embedded-hal (0.2) implementations will implicitly flush after blocking
+I/O. Users can rely on this behavior to synchronize external components.
+
 ## [0.5.4] 2023-11-26
 
 Add CCM APIs for configuring FlexIO clocks on 1000 targets.
diff --git a/Cargo.toml b/Cargo.toml
index 77c9dd02..a841283f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,11 @@ default-features = false
 [dependencies.nb]
 version = "1"
 
+[dependencies.futures]
+version = "0.3.30"
+default-features = false
+features = ["async-await"]
+
 [dependencies.eh02]
 package = "embedded-hal"
 version = "0.2"
@@ -155,7 +160,7 @@ name = "async_dma_spi"
 required-features = ["board/spi"]
 
 [[example]]
-name = "hal_spi"
+name = "rtic_spi_blocking"
 required-features = ["board/spi"]
 
 [[example]]
diff --git a/examples/hal_spi.rs b/examples/hal_spi.rs
deleted file mode 100644
index ff4977e8..00000000
--- a/examples/hal_spi.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-//! Demonstrates a blocking SPI peripheral.
-//!
-//! Connect your SDI and SDO pins together, then run this example.
-//! The example prints success / errors to the board's serial console.
-//! You should see a 1MHz SPI clock, and that the elements of a write /
-//! transfer operation occur within a single low PCS.
-
-#![no_main]
-#![no_std]
-
-use imxrt_hal as hal;
-
-use eh02::{
-    blocking::serial::Write as _,
-    blocking::spi::{Transfer, Write},
-};
-use hal::lpspi::LpspiError;
-
-const GPT1_DELAY_MS: u32 = board::GPT1_FREQUENCY / 1_000 * 500;
-const GPT1_OCR: hal::gpt::OutputCompareRegister = hal::gpt::OutputCompareRegister::OCR1;
-
-/// Change me to experiment with different word sizes.
-/// Valid types: u8, u16, u32.
-type Elem = u8;
-
-fn write_error<T>(console: &mut board::Console, result: Result<T, LpspiError>) {
-    use hal::lpspi::Direction;
-    match result {
-        Err(LpspiError::Busy) => {
-            console.bwrite_all(b"Error: BUSY\r\n").ok();
-        }
-        Err(LpspiError::Fifo(Direction::Rx)) => {
-            console.bwrite_all(b"Error: RX FIFO\r\n").ok();
-        }
-        Err(LpspiError::Fifo(Direction::Tx)) => {
-            console.bwrite_all(b"Error: TX FIFO\r\n").ok();
-        }
-        Err(LpspiError::NoData) => {
-            console.bwrite_all(b"Error: NO DATA\r\n").ok();
-        }
-        Err(LpspiError::FrameSize) => {
-            console.bwrite_all(b"Error: FRAME SIZE\r\n").ok();
-        }
-        Ok(_) => {}
-    }
-}
-
-#[imxrt_rt::entry]
-fn main() -> ! {
-    let (
-        board::Common { mut gpt1, .. },
-        board::Specifics {
-            mut spi,
-            mut console,
-            ..
-        },
-    ) = board::new();
-
-    gpt1.set_output_compare_count(GPT1_OCR, GPT1_DELAY_MS);
-    gpt1.set_mode(hal::gpt::Mode::Restart);
-    gpt1.enable();
-
-    console.bwrite_all(b"Starting example...\r\n").ok();
-    loop {
-        let data: [Elem; 5] = [0xDE, 0xAD, 0xBE, 0xEF, 0xA5];
-        let mut buffer: [Elem; 5] = data;
-
-        while !gpt1.is_elapsed(GPT1_OCR) {}
-        gpt1.clear_elapsed(GPT1_OCR);
-
-        console.bwrite_all(b"Transfer... ").ok();
-        let result = spi.transfer(&mut buffer);
-        if result.is_err() {
-            write_error(&mut console, result);
-        } else if buffer != data {
-            console.bwrite_all(b"Data mismatch\r\n").ok();
-        } else {
-            console.bwrite_all(b"OK\r\n").ok();
-        }
-
-        while !gpt1.is_elapsed(GPT1_OCR) {}
-        gpt1.clear_elapsed(GPT1_OCR);
-
-        console.bwrite_all(b"Write... ").ok();
-        let result = spi.write(&buffer[..3]);
-        if result.is_err() {
-            write_error(&mut console, result);
-        } else {
-            console.bwrite_all(b"OK\r\n").ok();
-        }
-    }
-}
diff --git a/examples/rtic_spi_blocking.rs b/examples/rtic_spi_blocking.rs
new file mode 100644
index 00000000..b7e8af9e
--- /dev/null
+++ b/examples/rtic_spi_blocking.rs
@@ -0,0 +1,168 @@
+//! Demonstrates a SPI device with blocking I/O.
+//!
+//! Connect SDI to SDO. The example runs in the idle task and uses
+//! blocking writes and loopback transfers. You can observe the
+//! I/O with a scope / logic analyzer. The SPI CLK runs at 1MHz.
+//!
+//! Keep an eye on the defmt log to see if tests fail.
+
+#![no_std]
+#![no_main]
+
+#[rtic::app(device = board, peripherals = false)]
+mod app {
+
+    use imxrt_hal as hal;
+
+    const PIT_DELAY_MS: u32 = board::PIT_FREQUENCY / 1_000 * 250;
+
+    #[local]
+    struct Local {
+        spi: board::Spi,
+        pit: hal::pit::Pit<2>,
+    }
+
+    #[shared]
+    struct Shared {}
+
+    #[init]
+    fn init(_: init::Context) -> (Shared, Local, init::Monotonics) {
+        let (
+            board::Common {
+                pit: (_, _, pit, _),
+                ..
+            },
+            board::Specifics { spi, .. },
+        ) = board::new();
+        (Shared {}, Local { spi, pit }, init::Monotonics())
+    }
+
+    #[idle(local = [spi, pit])]
+    fn idle(cx: idle::Context) -> ! {
+        let idle::LocalResources { spi, pit, .. } = cx.local;
+        pit.set_load_timer_value(PIT_DELAY_MS);
+
+        let mut delay = move || {
+            pit.enable();
+            while !pit.is_elapsed() {}
+            pit.clear_elapsed();
+            pit.disable();
+        };
+
+        loop {
+            for _ in 0..3 {
+                delay();
+            }
+
+            // For studying the effects of bit order and word size.
+            //
+            // If you have a logic analyzer that can change its word
+            // size and bit order, use this sequence to evaluate how
+            // the driver packs your transfer elements.
+            {
+                use eh02::blocking::spi::Write;
+                use hal::lpspi::BitOrder::{self, *};
+
+                const BIT_ORDERS: [BitOrder; 2] = [Msb, Lsb];
+
+                const U32_WORDS: [u32; 2] = [0xDEADBEEFu32, 0xAD1CAC1D];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U32_WORDS).unwrap();
+                }
+
+                const U8_WORDS: [u8; 7] = [0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U8_WORDS).unwrap();
+                }
+
+                const U16_WORDS: [u16; 3] = [0xDEADu16, 0xBEEF, 0xA5A5];
+                for bit_order in BIT_ORDERS {
+                    spi.set_bit_order(bit_order);
+                    spi.write(&U16_WORDS).unwrap();
+                }
+
+                delay();
+            }
+
+            // Change me to explore bit order behaviors in the
+            // remaining write / loopback transfer tests.
+            spi.set_bit_order(hal::lpspi::BitOrder::Msb);
+
+            // Make sure concatenated elements look correct on the wire.
+            {
+                use eh02::blocking::spi::Write;
+
+                spi.write(&[1u8, 2, 3]).unwrap();
+                spi.write(&[1u8, 2, 3, 4]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5, 6]).unwrap();
+                spi.write(&[1u8, 2, 3, 4, 5, 6, 7]).unwrap();
+
+                spi.write(&[0x0102u16, 0x0304, 0x0506]).unwrap();
+                spi.write(&[0x0102u16, 0x0304, 0x0506, 0x0708]).unwrap();
+                spi.write(&[0x0102u16, 0x0304, 0x0506, 0x0708, 0x090A])
+                    .unwrap();
+
+                spi.write(&[0x01020304u32, 0x05060708, 0x090A0B0C]).unwrap();
+
+                delay();
+            }
+
+            {
+                use eh02::blocking::spi::{Transfer, Write};
+
+                // Change me to test different Elem sizes, buffer sizes,
+                // bit patterns.
+                type Elem = u8;
+                const SENTINEL: Elem = 0x0F;
+                const BUFFER: [Elem; 13] = [SENTINEL; 13];
+
+                // Simple loopback transfer. Easy to find with your
+                // scope.
+                let mut buffer = BUFFER;
+                spi.transfer(&mut buffer).unwrap();
+                if buffer != BUFFER {
+                    defmt::error!("Simple transfer buffer mismatch!");
+                }
+
+                delay();
+
+                // Adjacent loopback transfer. Look for the big
+                // burst of data on your scope.
+                let mut buffer = BUFFER;
+                let mut error = false;
+                for idx in 0u32..16 {
+                    buffer.fill(SENTINEL.rotate_right(idx));
+                    let expected = buffer;
+                    spi.transfer(&mut buffer).unwrap();
+                    error |= buffer != expected;
+                }
+                if error {
+                    defmt::error!("At least one of the bursted transfers didn't match!");
+                }
+
+                delay();
+
+                // Simple write.
+                let buffer = BUFFER;
+                spi.write(&buffer).unwrap();
+
+                delay();
+
+                // Pipelined writes. Look for the burst of data
+                // on your scope. Internally, each write flushes,
+                // so the delay between writes should be about the
+                // same as the delay between the loopback transfers.
+                let mut buffer = BUFFER;
+                for idx in 0..16 {
+                    buffer.fill(SENTINEL.rotate_right(idx));
+                    spi.write(&buffer).unwrap();
+                }
+
+                delay();
+            }
+        }
+    }
+}
diff --git a/src/common/lpspi.rs b/src/common/lpspi.rs
index c2c03115..900b4c54 100644
--- a/src/common/lpspi.rs
+++ b/src/common/lpspi.rs
@@ -78,6 +78,9 @@
 //! transactions. However, keep in mind that disabling the receiver during a continuous transaction
 //! may not work as expected.
 
+use core::marker::PhantomData;
+use core::task::Poll;
+
 use crate::iomuxc::{consts, lpspi};
 use crate::ral;
 
@@ -599,16 +602,6 @@ impl<P, const N: u8> Lpspi<P, N> {
         }
     }
 
-    /// Check for any receiver errors.
-    fn recv_ok(&self) -> Result<(), LpspiError> {
-        let status = self.status();
-        if status.intersects(Status::RECEIVE_ERROR) {
-            Err(LpspiError::Fifo(Direction::Rx))
-        } else {
-            Ok(())
-        }
-    }
-
     /// Place `word` into the transmit FIFO.
     ///
     /// This will result in the value being sent from the LPSPI.
@@ -618,17 +611,79 @@ impl<P, const N: u8> Lpspi<P, N> {
         ral::write_reg!(ral::lpspi, self.lpspi, TDR, word);
     }
 
-    pub(crate) fn wait_for_transmit_fifo_space(&mut self) -> Result<(), LpspiError> {
-        loop {
+    /// Wait for transmit FIFO space in a (concurrent) spin loop.
+    ///
+    /// This future does not care about the TX FIFO watermark. Instead, it
+    /// checks the FIFO's size with an additional read.
+    pub(crate) async fn spin_for_fifo_space(&self) -> Result<(), LpspiError> {
+        core::future::poll_fn(|_| {
             let status = self.status();
             if status.intersects(Status::TRANSMIT_ERROR) {
-                return Err(LpspiError::Fifo(Direction::Tx));
+                return Poll::Ready(Err(LpspiError::Fifo(Direction::Tx)));
             }
             let fifo_status = self.fifo_status();
             if !fifo_status.is_full(Direction::Tx) {
-                return Ok(());
+                Poll::Ready(Ok(()))
+            } else {
+                Poll::Pending
+            }
+        })
+        .await
+    }
+
+    pub(crate) fn wait_for_transmit_fifo_space(&self) -> Result<(), LpspiError> {
+        crate::spin_on(self.spin_for_fifo_space())
+    }
+
+    /// Wait for receive data in a (concurrent) spin loop.
+    ///
+    /// This future does not care about the RX FIFO watermark. Instead, it
+    /// checks the FIFO's size with an additional read.
+    async fn spin_for_word(&self) -> Result<u32, LpspiError> {
+        core::future::poll_fn(|_| {
+            let status = self.status();
+            if status.intersects(Status::RECEIVE_ERROR) {
+                return Poll::Ready(Err(LpspiError::Fifo(Direction::Rx)));
+            }
+
+            let fifo_status = self.fifo_status();
+            if !fifo_status.is_empty(Direction::Rx) {
+                let data = self.read_data_unchecked();
+                Poll::Ready(Ok(data))
+            } else {
+                Poll::Pending
             }
+        })
+        .await
+    }
+
+    /// Send `len` LPSPI words (u32s) out of the peripheral.
+    ///
+    /// Expected to run in a (concurrent) spin loop, possibly with
+    /// `spin_receive`.
+    async fn spin_transmit(
+        &self,
+        mut data: impl TransmitData,
+        len: usize,
+    ) -> Result<(), LpspiError> {
+        for _ in 0..len {
+            self.spin_for_fifo_space().await?;
+            let word = data.next_word(self.bit_order);
+            self.enqueue_data(word);
         }
+        Ok(())
+    }
+
+    /// Accept `len` LPSPI words (u32s) from the peripheral.
+    ///
+    /// Expected to run in a (concurrent) spin loop, possibly with
+    /// `spin_transmit`.
+    async fn spin_receive(&self, mut data: impl ReceiveData, len: usize) -> Result<(), LpspiError> {
+        for _ in 0..len {
+            let word = self.spin_for_word().await?;
+            data.next_word(word);
+        }
+        Ok(())
     }
 
     /// Set the SPI mode for the peripheral.
@@ -694,107 +749,55 @@ impl<P, const N: u8> Lpspi<P, N> {
         }
     }
 
-    /// Exchanges data with the SPI device.
-    ///
-    /// This routine uses continuous transfers to perform the transaction, no matter the
-    /// primitive type. There's an optimization for &[u32] that we're missing; in this case,
-    /// we don't necessarily need to use continuous transfers. The frame size could be set to
-    /// 8 * buffer.len() * sizeof(u32), and we copy user words into the transmit queue as-is.
-    /// But handling the packing of u8s and u16s into the u32 transmit queue in software is
-    /// extra work, work that's effectively achieved when we use continuous transfers.
-    /// We're guessing that the time to pop a transmit command from the queue is much faster
-    /// than the time taken to pop from the data queue, so the extra queue utilization shouldn't
-    /// matter.
-    fn exchange<W>(&mut self, buffer: &mut [W]) -> Result<(), LpspiError>
-    where
-        W: Word,
-    {
-        if self.status().intersects(Status::BUSY) {
-            return Err(LpspiError::Busy);
-        } else if buffer.is_empty() {
-            return Err(LpspiError::NoData);
+    fn exchange<W: Word>(&mut self, data: &mut [W]) -> Result<(), LpspiError> {
+        if data.is_empty() {
+            return Ok(());
         }
 
-        self.clear_fifos();
-
-        let mut transaction = Transaction::new(8 * core::mem::size_of::<W>() as u16)?;
+        let mut transaction = Transaction::new_words(data)?;
         transaction.bit_order = self.bit_order();
-        transaction.continuous = true;
-
-        let mut tx_idx = 0usize;
-        let mut rx_idx = 0usize;
 
-        // Continue looping while there is either tx OR rx remaining
-        while tx_idx < buffer.len() || rx_idx < buffer.len() {
-            if tx_idx < buffer.len() {
-                let word = buffer[tx_idx];
+        self.wait_for_transmit_fifo_space()?;
+        self.enqueue_transaction(&transaction);
 
-                // Turn off TCR CONT on last tx as a workaround so that the final
-                // falling edge comes through:
-                // https://community.nxp.com/t5/i-MX-RT/RT1050-LPSPI-last-bit-not-completing-in-continuous-mode/m-p/898460
-                if tx_idx + 1 == buffer.len() {
-                    transaction.continuous = false;
-                }
+        let word_count = word_count(data);
+        let (tx, rx) = transfer_in_place(data);
 
-                self.wait_for_transmit_fifo_space()?;
-                self.enqueue_transaction(&transaction);
+        crate::spin_on(futures::future::try_join(
+            self.spin_transmit(tx, word_count),
+            self.spin_receive(rx, word_count),
+        ))
+        .map_err(|err| {
+            self.recover_from_error();
+            err
+        })?;
 
-                self.wait_for_transmit_fifo_space()?;
-                self.enqueue_data(word.into());
-                transaction.continuing = true;
-                tx_idx += 1;
-            }
-
-            if rx_idx < buffer.len() {
-                self.recv_ok()?;
-                if let Some(word) = self.read_data() {
-                    buffer[rx_idx] = word.try_into().unwrap_or(W::MAX);
-                    rx_idx += 1;
-                }
-            }
-        }
+        self.flush()?;
 
         Ok(())
     }
 
-    /// Write data to the transmit queue without subsequently reading
-    /// the receive queue.
-    ///
-    /// Use this method when you know that the receiver queue is disabled
-    /// (RXMASK high in TCR).
-    ///
-    /// Similar to `exchange`, this is using continuous transfers for all supported primitives.
-    fn write_no_read<W>(&mut self, buffer: &[W]) -> Result<(), LpspiError>
-    where
-        W: Word,
-    {
-        if self.status().intersects(Status::BUSY) {
-            return Err(LpspiError::Busy);
-        } else if buffer.is_empty() {
-            return Err(LpspiError::NoData);
+    fn write_no_read<W: Word>(&mut self, data: &[W]) -> Result<(), LpspiError> {
+        if data.is_empty() {
+            return Ok(());
         }
 
-        self.clear_fifos();
-
-        let mut transaction = Transaction::new(8 * core::mem::size_of::<W>() as u16)?;
-        transaction.bit_order = self.bit_order();
-        transaction.continuous = true;
+        let mut transaction = Transaction::new_words(data)?;
         transaction.receive_data_mask = true;
+        transaction.bit_order = self.bit_order();
 
-        for word in buffer {
-            self.wait_for_transmit_fifo_space()?;
-            self.enqueue_transaction(&transaction);
+        self.wait_for_transmit_fifo_space()?;
+        self.enqueue_transaction(&transaction);
 
-            self.wait_for_transmit_fifo_space()?;
-            self.enqueue_data((*word).into());
-            transaction.continuing = true;
-        }
+        let word_count = word_count(data);
+        let tx = TransmitBuffer::new(data);
 
-        transaction.continuing = false;
-        transaction.continuous = false;
+        crate::spin_on(self.spin_transmit(tx, word_count)).map_err(|err| {
+            self.recover_from_error();
+            err
+        })?;
 
-        self.wait_for_transmit_fifo_space()?;
-        self.enqueue_transaction(&transaction);
+        self.flush()?;
 
         Ok(())
     }
@@ -913,6 +916,15 @@ impl<P, const N: u8> Lpspi<P, N> {
     pub fn set_watermark(&mut self, direction: Direction, watermark: u8) -> u8 {
         set_watermark(&self.lpspi, direction, watermark)
     }
+
+    /// Recover from a transaction error.
+    fn recover_from_error(&mut self) {
+        // Resets the peripheral and flushes whatever is in the FIFOs.
+        self.soft_reset();
+
+        // Reset the status flags, clearing the error condition for the next use.
+        self.clear_status(Status::TRANSMIT_ERROR | Status::RECEIVE_ERROR);
+    }
 }
 
 bitflags::bitflags! {
@@ -1199,22 +1211,462 @@ impl<P, const N: u8> eh02::blocking::spi::Write<u32> for Lpspi<P, N> {
 /// Describes SPI words that can participate in transactions.
 trait Word: Copy + Into<u32> + TryFrom<u32> {
     const MAX: Self;
+    const ZERO: Self;
+
+    /// Repeatedly call `provider` to produce elements of `Self`,
+    /// then pack those elements into a single LPSPI word.
+    fn pack_word(bit_order: BitOrder, provider: impl FnMut() -> Option<Self>) -> u32;
+
+    /// Given a word, call `sink` with each component, least-significant first.
+    /// NOTE(review): no bit order here; confirm this mirrors `pack_word` for MSB.
+    fn unpack_word(word: u32, sink: impl FnMut(Self));
 }
 
 impl Word for u8 {
     const MAX: u8 = u8::MAX;
+    const ZERO: u8 = 0;
+    fn pack_word(bit_order: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        let mut word = 0;
+        match bit_order {
+            BitOrder::Msb => {
+                for _ in 0..4 {
+                    if let Some(byte) = provider() {
+                        word <<= 8;
+                        word |= u32::from(byte);
+                    }
+                }
+            }
+            BitOrder::Lsb => {
+                for offset in 0..4 {
+                    if let Some(byte) = provider() {
+                        word |= u32::from(byte) << (8 * offset);
+                    }
+                }
+            }
+        }
+
+        word
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        for offset in [0, 8, 16, 24] {
+            sink((word >> offset) as u8);
+        }
+    }
 }
 
 impl Word for u16 {
     const MAX: u16 = u16::MAX;
+    const ZERO: u16 = 0;
+    fn pack_word(bit_order: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        let mut word = 0;
+        match bit_order {
+            BitOrder::Msb => {
+                for _ in 0..2 {
+                    if let Some(half) = provider() {
+                        word <<= 16;
+                        word |= u32::from(half);
+                    }
+                }
+            }
+            BitOrder::Lsb => {
+                for offset in 0..2 {
+                    if let Some(half) = provider() {
+                        word |= u32::from(half) << (16 * offset);
+                    }
+                }
+            }
+        }
+
+        word
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        for offset in [0, 16] {
+            sink((word >> offset) as u16);
+        }
+    }
 }
 
 impl Word for u32 {
     const MAX: u32 = u32::MAX;
+    const ZERO: u32 = 0;
+    fn pack_word(_: BitOrder, mut provider: impl FnMut() -> Option<Self>) -> u32 {
+        provider().unwrap_or(0)
+    }
+    fn unpack_word(word: u32, mut sink: impl FnMut(Self)) {
+        sink(word)
+    }
+}
+
+/// Generalizes how we prepare LPSPI words for transmit.
+trait TransmitData {
+    /// Get the next word for the transmit FIFO.
+    ///
+    /// If you're out of words, return 0.
+    fn next_word(&mut self, bit_order: BitOrder) -> u32;
+}
+
+/// Generalizes how we save LPSPI data into memory.
+trait ReceiveData {
+    /// Invoked each time we read data from the queue.
+    fn next_word(&mut self, word: u32);
+}
+
+/// Transmit data from a buffer.
+struct TransmitBuffer<'a, W> {
+    /// The read position.
+    ptr: *const W,
+    /// One past the end of the buffer.
+    end: *const W,
+    _buffer: PhantomData<&'a [W]>,
+}
+
+impl<'a, W> TransmitBuffer<'a, W>
+where
+    W: Word,
+{
+    fn new(buffer: &'a [W]) -> Self {
+        // Safety: `buffer` is a valid slice, so `ptr + len` is one past its end.
+        unsafe { Self::from_raw(buffer.as_ptr(), buffer.len()) }
+    }
+
+    /// # Safety
+    ///
+    /// `ptr + len` must be in bounds, or one past the end of the
+    /// allocation.
+    unsafe fn from_raw(ptr: *const W, len: usize) -> Self {
+        Self {
+            ptr,
+            end: unsafe { ptr.add(len) },
+            _buffer: PhantomData,
+        }
+    }
+
+    /// Read the next element from the buffer.
+    fn next_read(&mut self) -> Option<W> {
+        // Safety: read the next word only if we're in bounds.
+        unsafe {
+            (self.ptr != self.end).then(|| {
+                let word = self.ptr.read();
+                self.ptr = self.ptr.add(1);
+                word
+            })
+        }
+    }
+}
+
+impl<W> TransmitData for TransmitBuffer<'_, W>
+where
+    W: Word,
+{
+    fn next_word(&mut self, bit_order: BitOrder) -> u32 {
+        W::pack_word(bit_order, || self.next_read())
+    }
+}
+
+/// Transmits dummy values.
+struct TransmitDummies;
+
+impl TransmitData for TransmitDummies {
+    fn next_word(&mut self, _: BitOrder) -> u32 {
+        u32::MAX
+    }
+}
+
+/// Receive data into a buffer.
+struct ReceiveBuffer<'a, W> {
+    /// The write position.
+    ptr: *mut W,
+    /// One past the end of the buffer.
+    end: *const W,
+    _buffer: PhantomData<&'a [W]>,
 }
 
+impl<'a, W> ReceiveBuffer<'a, W>
+where
+    W: Word,
+{
+    #[cfg(test)] // TODO(mciantyre) remove once needed in non-test code.
+    fn new(buffer: &'a mut [W]) -> Self {
+        // Safety: `buffer` is a valid slice, so `ptr + len` is one past its end.
+        unsafe { Self::from_raw(buffer.as_mut_ptr(), buffer.len()) }
+    }
+
+    /// # Safety
+    ///
+    /// `ptr + len` must be in bounds, or one past the end of the
+    /// allocation.
+    unsafe fn from_raw(ptr: *mut W, len: usize) -> Self {
+        Self {
+            ptr,
+            end: unsafe { ptr.cast_const().add(len) },
+            _buffer: PhantomData,
+        }
+    }
+
+    /// Put the next element into the buffer.
+    fn next_write(&mut self, elem: W) {
+        // Safety: write the next word only if we're in bounds.
+        // Words are primitive types; we don't need to execute
+        // a drop when we overwrite a value in memory.
+        unsafe {
+            if self.ptr.cast_const() != self.end {
+                self.ptr.write(elem);
+                self.ptr = self.ptr.add(1);
+            }
+        }
+    }
+}
+
+impl<W> ReceiveData for ReceiveBuffer<'_, W>
+where
+    W: Word,
+{
+    fn next_word(&mut self, word: u32) {
+        W::unpack_word(word, |elem| self.next_write(elem));
+    }
+}
+
+/// Receive dummy data.
+struct ReceiveDummies;
+
+impl ReceiveData for ReceiveDummies {
+    fn next_word(&mut self, _: u32) {}
+}
+
+/// Computes how many Ws fit inside a LPSPI word.
+const fn per_word<W: Word>() -> usize {
+    core::mem::size_of::<u32>() / core::mem::size_of::<W>()
+}
+
+/// Computes how many u32 words we need to transact this buffer.
+const fn word_count<W: Word>(words: &[W]) -> usize {
+    (words.len() + per_word::<W>() - 1) / per_word::<W>()
+}
+
+/// Creates the transmit and receive buffer objects for an
+/// in-place transfer.
+fn transfer_in_place<W: Word>(buffer: &mut [W]) -> (TransmitBuffer<'_, W>, ReceiveBuffer<'_, W>) {
+    // Safety: pointer math meets expectation. This produces
+    // a mutable and immutable pointer to the same mutable buffer.
+    // Module inspection shows that these pointers never become
+    // references. We maintain the lifetime across both objects,
+    // so the buffer isn't dropped.
+    unsafe {
+        let len = buffer.len();
+        let ptr = buffer.as_mut_ptr();
+        (
+            TransmitBuffer::from_raw(ptr, len),
+            ReceiveBuffer::from_raw(ptr, len),
+        )
+    }
+}
+
+/// Tests try to approximate the way we'll use TransmitBuffer and ReceiveBuffer
+/// in firmware. Consider running these with miri to evaluate unsafe usages.
 #[cfg(test)]
 mod tests {
+    #[test]
+    fn transfer_in_place_interleaved_read_write_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER;
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            assert_eq!(elem, tx.next_read().unwrap());
+            rx.next_write(elem + 1);
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_interleaved_write_read_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER;
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            rx.next_write(elem + 1);
+            assert_eq!(elem + 1, tx.next_read().unwrap());
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_bulk_read_write_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER;
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            assert_eq!(elem, tx.next_read().unwrap());
+        }
+        for elem in BUFFER {
+            rx.next_write(elem + 1);
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transfer_in_place_bulk_write_read_u32() {
+        const BUFFER: [u32; 9] = [42u32, 43, 44, 45, 46, 47, 48, 49, 50];
+        let mut buffer = BUFFER;
+        let (mut tx, mut rx) = super::transfer_in_place(&mut buffer);
+
+        for elem in BUFFER {
+            rx.next_write(elem + 1);
+        }
+        for elem in BUFFER {
+            assert_eq!(elem + 1, tx.next_read().unwrap());
+        }
+
+        assert_eq!(buffer, [43, 44, 45, 46, 47, 48, 49, 50, 51]);
+    }
+
+    #[test]
+    fn transmit_buffer() { // Pins the u32 words `next_word` yields per element type and bit order.
+        use super::{BitOrder::*, TransmitBuffer, TransmitData};
+
+        //
+        // u32
+        //
+        // This is the easiest to understand w.r.t. the bit order, since this is the natural word
+        // size of the peripheral. No matter the bit order, we produce the same word for the TX
+        // FIFO. The hardware handles the MSB or LSB transform.
+
+        let mut tx = TransmitBuffer::new(&[0xDEADBEEFu32, 0xAD1CAC1D]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0xAD1CAC1D);
+        assert_eq!(tx.next_word(Msb), 0); // exhausted buffer: a zero word is expected
+
+        let mut tx = TransmitBuffer::new(&[0xDEADBEEFu32, 0xAD1CAC1D]);
+        assert_eq!(tx.next_word(Lsb), 0xDEADBEEF); // identical to the Msb expectation
+        assert_eq!(tx.next_word(Lsb), 0xAD1CAC1D);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        //
+        // u8
+        //
+        // If the user prefers u8 words, then we should pack the bytes into a u32 such that the
+        // hardware's MSB/LSB transform maintains the (literal) byte order.
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF); // Msb: first byte ends up most significant
+        assert_eq!(tx.next_word(Msb), 0x00A5001D); // three leftover bytes fill the low 24 bits
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0); // stays zero on repeated calls
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF, 0xA5, 0x00, 0x1D]);
+        assert_eq!(tx.next_word(Lsb), 0xEFBEADDE); // Lsb: first byte ends up least significant
+        assert_eq!(tx.next_word(Lsb), 0x001D00A5); // leftovers still fill the low 24 bits
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF]); // exactly one full word
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE, 0xEF]);
+        assert_eq!(tx.next_word(Lsb), 0xEFBEADDE);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE]); // shorter than one word
+        assert_eq!(tx.next_word(Msb), 0x00DEADBE); // top byte of the word stays zero
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEu8, 0xAD, 0xBE]);
+        assert_eq!(tx.next_word(Lsb), 0x00BEADDE); // top byte of the word stays zero
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        //
+        // u16
+        //
+        // Same goes here: we should combine u16s such that the hardware transfers elements
+        // in order while applying the MSB/LSB transform on each u16.
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF, 0xA5A5]);
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF); // Msb: first u16 lands in the upper half
+        assert_eq!(tx.next_word(Msb), 0x0000A5A5); // lone trailing u16 sits in the lower half
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF, 0xA5A5]);
+        assert_eq!(tx.next_word(Lsb), 0xBEEFDEAD); // Lsb: first u16 lands in the lower half
+        assert_eq!(tx.next_word(Lsb), 0x0000A5A5);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF]); // exactly one full word
+        assert_eq!(tx.next_word(Msb), 0xDEADBEEF);
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16, 0xBEEF]);
+        assert_eq!(tx.next_word(Lsb), 0xBEEFDEAD);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16]); // shorter than one word
+        assert_eq!(tx.next_word(Msb), 0x0000DEAD); // same expected word for both bit orders
+        assert_eq!(tx.next_word(Msb), 0);
+        assert_eq!(tx.next_word(Msb), 0);
+
+        let mut tx = TransmitBuffer::new(&[0xDEADu16]);
+        assert_eq!(tx.next_word(Lsb), 0x0000DEAD);
+        assert_eq!(tx.next_word(Lsb), 0);
+        assert_eq!(tx.next_word(Lsb), 0);
+    }
+
+    #[test]
+    fn receive_buffer() { // Pins how received u32 words unpack into u8, u16, and u32 buffers.
+        use super::{ReceiveBuffer, ReceiveData};
+
+        //
+        // u8
+        //
+
+        let mut buffer = [0u8; 9]; // room for two full words plus one extra byte
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x04030201); // one slot left: only the low byte (0x01) is expected
+        rx.next_word(0x55555555); // buffer is full: expected to leave it untouched
+        assert_eq!(
+            buffer,
+            [0xEF, 0xBE, 0xAD, 0xDE, 0x1D, 0xAC, 0x1C, 0xAD, 0x01] // low byte of each word first
+        );
+
+        //
+        // u16
+        //
+
+        let mut buffer = [0u16; 5]; // room for two full words plus one extra u16
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x04030201); // one slot left: only the low half (0x0201) is expected
+        rx.next_word(0x55555555); // buffer is full: expected to leave it untouched
+        assert_eq!(buffer, [0xBEEF, 0xDEAD, 0xAC1D, 0xAD1C, 0x0201]); // low half first
+
+        //
+        // u32
+        //
+
+        let mut buffer = [0u32; 3]; // room for exactly three words
+        let mut rx = ReceiveBuffer::new(&mut buffer);
+        rx.next_word(0xDEADBEEF);
+        rx.next_word(0xAD1CAC1D);
+        rx.next_word(0x77777777);
+        rx.next_word(0x55555555); // buffer is full: expected to leave it untouched
+        assert_eq!(buffer, [0xDEADBEEF, 0xAD1CAC1D, 0x77777777]); // words stored unchanged
+    }
+
     #[test]
     fn transaction_frame_sizes() {
         assert!(super::Transaction::new_words(&[1u8]).is_ok());
diff --git a/src/lib.rs b/src/lib.rs
index d0993d63..e9caa0ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -295,3 +295,18 @@ pub mod iomuxc {
 
 #[cfg_attr(family = "none", allow(unused_imports))] // Nothing to export in this build.
 pub use crate::chip::reexports::*;
+
+/// Block until `future` completes by polling it in a busy loop.
+fn spin_on<F: core::future::Future>(future: F) -> F::Output {
+    use core::task::{Context, Poll};
+
+    let noop = futures::task::noop_waker();
+    let mut cx = Context::from_waker(&noop);
+    let mut pinned = core::pin::pin!(future);
+    loop {
+        match pinned.as_mut().poll(&mut cx) {
+            Poll::Ready(output) => break output,
+            Poll::Pending => {} // not finished yet: poll again right away
+        }
+    }
+}