diff --git a/src/classes.rs b/src/classes.rs
new file mode 100644
index 0000000..143908b
--- /dev/null
+++ b/src/classes.rs
@@ -0,0 +1,271 @@
+use core::fmt;
+
+/// A map from each of the 256 byte values to its equivalence class.
+///
+/// Index `b` of the inner array holds the class identifier of byte `b`. A DFA
+/// uses this to shrink its transition table, which can have a large impact on
+/// both the total size of a dense DFA and on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+    /// Builds a class map in which every byte belongs to class `0`, i.e.
+    /// a single equivalence class that contains all 256 bytes.
+    pub fn empty() -> ByteClasses {
+        ByteClasses([0; 256])
+    }
+
+    /// Builds a class map in which byte `b` is assigned class `b`, so that
+    /// there are 256 classes and each one holds exactly one byte.
+    pub fn singletons() -> ByteClasses {
+        let mut table = [0u8; 256];
+        for (byte, slot) in table.iter_mut().enumerate() {
+            *slot = byte as u8;
+        }
+        ByteClasses(table)
+    }
+
+    /// Builds a class map from `slice`, where `slice[b]` is the class of byte
+    /// `b`. An empty slice is shorthand for `ByteClasses::singletons()`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the length of `slice` is neither 0 nor 256.
+    pub fn from_slice(slice: &[u8]) -> ByteClasses {
+        assert!(slice.is_empty() || slice.len() == 256);
+
+        if slice.is_empty() {
+            return ByteClasses::singletons();
+        }
+        let mut table = [0u8; 256];
+        table.copy_from_slice(slice);
+        ByteClasses(table)
+    }
+
+    /// Assign `class` as the equivalence class of `byte`.
+    #[inline]
+    pub fn set(&mut self, byte: u8, class: u8) {
+        self.0[usize::from(byte)] = class;
+    }
+
+    /// Look up the equivalence class of `byte`.
+    #[inline]
+    pub fn get(&self, byte: u8) -> u8 {
+        self.0[usize::from(byte)]
+    }
+
+    /// Look up the equivalence class of `byte` without a bounds check. The
+    /// index is a `u8` widened to `usize`, so it is always below 256.
+    #[inline]
+    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
+        *self.0.get_unchecked(usize::from(byte))
+    }
+
+    /// Return the size of the alphabet described by this map, computed as
+    /// one more than the class of byte 255. That equals the number of classes
+    /// when identifiers are handed out in byte order, from 0, without gaps.
+    #[inline]
+    pub fn alphabet_len(&self) -> usize {
+        usize::from(self.0[255]) + 1
+    }
+
+    /// Returns true if and only if the alphabet has 256 elements, which
+    /// means there are 256 equivalence classes and each class contains
+    /// exactly one byte.
+    #[inline]
+    pub fn is_singleton(&self) -> bool {
+        256 == self.alphabet_len()
+    }
+
+    /// Returns an iterator that yields one representative byte per
+    /// equivalence class: the first byte of every run of consecutive bytes
+    /// that share a class. When each class is a contiguous range of bytes,
+    /// this yields exactly as many items as there are classes.
+    ///
+    /// This is useful when one is determinizing an NFA and the NFA's alphabet
+    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+    /// byte from each equivalence class then permits a full exploration of
+    /// the NFA instead of using every possible byte value.
+    #[cfg(feature = "std")]
+    pub fn representatives(&self) -> ByteClassRepresentatives {
+        ByteClassRepresentatives { last_class: None, byte: 0, classes: self }
+    }
+
+    /// Collects, in ascending order, every byte whose class is `equiv` into
+    /// the front of an array, returned together with the number of bytes.
+    fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
+        let mut members = [0u8; 256];
+        let mut count = 0;
+        for byte in (0..256).map(|b| b as u8) {
+            if self.get(byte) != equiv {
+                continue;
+            }
+            members[count] = byte;
+            count += 1;
+        }
+        (members, count)
+    }
+}
+
+impl fmt::Debug for ByteClasses {
+    /// Entries look like `id => [bytes]`; there is always at least one.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        if self.is_singleton() {
+            return write!(f, "ByteClasses({{singletons}})");
+        }
+        for equiv in 0..self.alphabet_len() {
+            let (members, len) = self.elements(equiv as u8);
+            let sep = if equiv == 0 { "ByteClasses(" } else { ", " };
+            write!(f, "{}{} => {:?}", sep, equiv, &members[..len])?;
+        }
+        write!(f, ")")
+    }
+}
+
+/// Iterator yielding one representative byte per run of equal classes.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+    classes: &'a ByteClasses, // class map being scanned
+    byte: usize, // next byte value to inspect; 256 once exhausted
+    last_class: Option<u8>, // class of the previously inspected byte
+}
+
+#[cfg(feature = "std")]
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+    type Item = u8;
+
+    /// Yields the next byte whose class differs from its predecessor's.
+    fn next(&mut self) -> Option<u8> {
+        while self.byte < 256 {
+            let current = self.byte as u8;
+            self.byte += 1;
+            let seen = Some(self.classes.get(current));
+            if seen != self.last_class {
+                self.last_class = seen;
+                return Some(current);
+            }
+        }
+        None
+    }
+}
+
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, no two bytes in the same
+/// equivalence class can discriminate between a match and a non-match.
+///
+/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
+/// same equivalence class because it never matters whether an `a` or a `b` is
+/// seen, and no combination of `a`s and `b`s in the text can discriminate
+/// a match.
+///
+/// Note though that this does not compute the minimal set of equivalence
+/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
+/// same equivalence class for the same reason that `a` and `b` are in the
+/// same equivalence class in the aforementioned regex. However, in this
+/// implementation, `a` and `c` are put into distinct equivalence classes.
+/// The reason for this is implementation complexity. In the future, we should
+/// endeavor to compute the minimal equivalence classes since they can have a
+/// rather large impact on the size of the DFA.
+///
+/// The representation here is a vector of 256 booleans, all initially set to
+/// false. Each boolean maps to its corresponding byte based on position. A
+/// `true` value marks the last byte of an equivalence class: that byte and
+/// the bytes of all immediately preceding contiguous `false` values are in
+/// the same equivalence class.
+///
+/// This particular representation only permits contiguous ranges of bytes to
+/// be in the same equivalence class, which means that we can never discover
+/// the true minimal set of equivalence classes.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub struct ByteClassSet(Vec<bool>);
+
+#[cfg(feature = "std")]
+impl ByteClassSet {
+    /// Create a set with no class boundaries, i.e. one in which all 256 bytes
+    /// are part of the same equivalence class.
+    pub fn new() -> Self {
+        ByteClassSet(vec![false; 256])
+    }
+
+    /// Indicate that the range of bytes given (inclusive) can discriminate a
+    /// match between it and all other bytes outside of the range.
+    pub fn set_range(&mut self, start: u8, end: u8) {
+        debug_assert!(start <= end);
+        if let Some(before) = start.checked_sub(1) {
+            self.0[usize::from(before)] = true;
+        }
+        self.0[usize::from(end)] = true;
+    }
+
+    /// Convert this boolean set into a map from every byte value to its
+    /// equivalence class. Classes are numbered from 0 in increasing byte
+    /// order, so the class of byte 255 is the largest identifier (which is
+    /// never bigger than 255).
+    ///
+    /// The boundary flag of byte 255 is not consulted since no byte follows.
+    pub fn byte_classes(&self) -> ByteClasses {
+        let mut map = ByteClasses::empty();
+        let mut current = 0u8;
+        // Byte 0 always starts class 0, which `empty()` already recorded.
+        for byte in 1..256usize {
+            // A boundary right before `byte` means it opens a new class.
+            if self.0[byte - 1] {
+                current = current.checked_add(1).unwrap();
+            }
+            map.set(byte as u8, current);
+        }
+        map
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[cfg(feature = "std")]
+    #[test]
+    fn byte_classes() {
+        use super::ByteClassSet;
+
+        // A single range splits the alphabet into three classes: the bytes
+        // below the range, the range itself and the bytes above it.
+        let mut set = ByteClassSet::new();
+        set.set_range(b'a', b'z');
+        let classes = set.byte_classes();
+        let expected = [
+            (0, 0), (1, 0), (2, 0), (b'a' - 1, 0), (b'a', 1), (b'm', 1),
+            (b'z', 1), (b'z' + 1, 2), (254, 2), (255, 2),
+        ];
+        for &(byte, class) in expected.iter() {
+            assert_eq!(classes.get(byte), class);
+        }
+
+        // A range starting at byte 0 has no boundary below it, and the gap
+        // between two ranges becomes a class of its own.
+        let mut set = ByteClassSet::new();
+        set.set_range(0, 2);
+        set.set_range(4, 6);
+        let classes = set.byte_classes();
+        let expected = [
+            (0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 2), (6, 2), (7, 3),
+            (255, 3),
+        ];
+        for &(byte, class) in expected.iter() {
+            assert_eq!(classes.get(byte), class);
+        }
+    }
+
+    #[cfg(feature = "std")]
+    #[test]
+    fn full_byte_classes() {
+        use super::ByteClassSet;
+
+        // Marking every byte as its own one-byte range must yield all 256
+        // classes, i.e. the full alphabet.
+        let mut set = ByteClassSet::new();
+        for byte in (0..256u16).map(|i| i as u8) {
+            set.set_range(byte, byte);
+        }
+        assert_eq!(set.byte_classes().alphabet_len(), 256);
+    }
+}
diff --git a/src/codegen.rs b/src/codegen.rs
new file mode 100644
index 0000000..b2aacbb
--- /dev/null
+++ b/src/codegen.rs
@@ -0,0 +1,104 @@
+// This module is unused. It was written as an experiment to get a ballpark
+// idea of what state machines look like when translated to Rust code, and
+// in particular, an idea of how much code it generates. The implementation
+// below isn't optimal with respect to size, but the result wasn't exactly
+// small. At some point, we should pursue building this out beyond
+// experimentation, and in particular, probably provide a command line tool
+// and/or a macro. It's a fair bit of work, so I abandoned it for the initial
+// release. ---AG
+
+use std::collections::HashMap;
+use std::io::Write;
+
+use dense::DFA;
+use state_id::StateID;
+
+macro_rules! wstr { // `write!` whose result is unwrapped (panics on error)
+    ($($tt:tt)*) => { write!($($tt)*).unwrap() }
+}
+
+macro_rules! wstrln { // `writeln!` whose result is unwrapped (panics on error)
+    ($($tt:tt)*) => { writeln!($($tt)*).unwrap() }
+}
+
+/// Generate Rust source for an `is_match` function that walks `dfa`, with
+/// one enum variant per non-match state and `return true` on a match state.
+pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String {
+    let names = state_variant_names(dfa);
+
+    let mut buf = vec![];
+    wstrln!(buf, "pub fn is_match(input: &[u8])  -> bool {{");
+    if dfa.is_match_state(dfa.start()) {
+        wstrln!(buf, "    return true;");
+        wstrln!(buf, "}}");
+        return String::from_utf8(buf).unwrap();
+    }
+
+    wstrln!(buf, "{}", state_enum_def(dfa, &names));
+
+    wstrln!(buf, "    let mut state = {};", names[&dfa.start()]);
+    wstrln!(buf, "    for &b in input.iter() {{");
+    wstrln!(buf, "        state = match state {{");
+    for (id, s) in dfa.iter() {
+        if dfa.is_match_state(id) {
+            continue;
+        }
+        // Ranges use `..=`: the older `...` pattern syntax is deprecated.
+        wstrln!(buf, "            {} => {{", &names[&id]);
+        wstrln!(buf, "                match b {{");
+        for (start, end, next_id) in s.sparse_transitions() {
+            if dfa.is_match_state(next_id) {
+                wstrln!(buf, "                    {:?}..={:?} => return true,", start, end);
+            } else if start == end {
+                wstrln!(buf, "                    {:?} => {},", start, &names[&next_id]);
+            } else {
+                wstrln!(buf, "                    {:?}..={:?} => {},", start, end, &names[&next_id]);
+            }
+        }
+        wstrln!(buf, "                    _ => S::S0,");
+        wstrln!(buf, "                }}");
+        wstrln!(buf, "            }}");
+    }
+    wstrln!(buf, "        }};");
+    wstrln!(buf, "    }}");
+
+    wstrln!(buf, "    false");
+    wstrln!(buf, "}}");
+    String::from_utf8(buf).unwrap()
+}
+
+/// Render the definition of enum `S`, with a variant `S<id>` for every
+/// non-match state of `dfa`, ten variants per line.
+/// `variant_names` is currently unused.
+fn state_enum_def<S: StateID>(
+    dfa: &DFA<S>,
+    variant_names: &HashMap<S, String>,
+) -> String {
+    let mut out = vec![];
+    wstrln!(out, "    #[derive(Clone, Copy)]");
+    wstr!(out, "    enum S {{");
+
+    let mut emitted = 0;
+    for (id, _) in dfa.iter() {
+        if !dfa.is_match_state(id) {
+            // Break the line before every tenth variant.
+            let lead = if emitted % 10 == 0 { "\n        " } else { " " };
+            wstr!(out, "{}S{},", lead, id.to_usize());
+            emitted += 1;
+        }
+    }
+    wstr!(out, "\n");
+    wstrln!(out, "    }}");
+    String::from_utf8(out).unwrap()
+}
+
+/// Map each non-match state id of `dfa` to its variant path `S::S<id>`.
+fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> {
+    let mut names = HashMap::new();
+    for (id, _) in dfa.iter() {
+        if !dfa.is_match_state(id) {
+            names.insert(id, format!("S::S{}", id.to_usize()));
+        }
+    }
+    names
+}
diff --git a/src/dense.rs b/src/dense.rs
new file mode 100644
index 0000000..ed4d1b6
--- /dev/null
+++ b/src/dense.rs
@@ -0,0 +1,2332 @@
+#[cfg(feature = "std")]
+use core::fmt;
+#[cfg(feature = "std")]
+use core::iter;
+use core::mem;
+use core::slice;
+
+#[cfg(feature = "std")]
+use byteorder::{BigEndian, LittleEndian};
+use byteorder::{ByteOrder, NativeEndian};
+#[cfg(feature = "std")]
+use regex_syntax::ParserBuilder;
+
+use classes::ByteClasses;
+#[cfg(feature = "std")]
+use determinize::Determinizer;
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::{Error, Result};
+#[cfg(feature = "std")]
+use minimize::Minimizer;
+#[cfg(feature = "std")]
+use nfa::{self, NFA};
+#[cfg(feature = "std")]
+use sparse::SparseDFA;
+use state_id::{dead_id, StateID};
+#[cfg(feature = "std")]
+use state_id::{
+    next_state_id, premultiply_overflow_error, write_state_id_bytes,
+};
+
+/// The size of the alphabet in a standard DFA: one symbol per byte value.
+///
+/// Specifically, this length controls the number of transitions present in
+/// each DFA state. However, when the byte class optimization is enabled,
+/// then each DFA maps the space of all possible 256 byte values to at most
+/// 256 distinct equivalence classes. In this case, the number of distinct
+/// equivalence classes corresponds to the internal alphabet of the DFA, in the
+/// sense that each DFA state has a number of transitions equal to the number
+/// of equivalence classes despite supporting matching on all possible byte
+/// values.
+const ALPHABET_LEN: usize = 256;
+
+/// Bit masks used in the serialization of DFAs; each occupies its own bit.
+pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001; // bit 0
+pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010; // bit 1
+
+/// A dense table-based deterministic finite automaton (DFA).
+///
+/// A dense DFA represents the core matching primitive in this crate. That is,
+/// logically, all DFAs have a single start state, one or more match states
+/// and a transition table that maps the current state and the current byte of
+/// input to the next state. A DFA can use this information to implement fast
+/// searching. In particular, the use of a dense DFA generally makes the trade
+/// off that match speed is the most valuable characteristic, even if building
+/// the regex may take significant time *and* space. As such, the processing
+/// of every byte of input is done with a small constant number of operations
+/// that does not vary with the pattern, its size or the size of the alphabet.
+/// If your needs don't line up with this trade off, then a dense DFA may not
+/// be an adequate solution to your problem.
+///
+/// In contrast, a [sparse DFA](enum.SparseDFA.html) makes the opposite
+/// trade off: it uses less space but will execute a variable number of
+/// instructions per byte at match time, which makes it slower for matching.
+///
+/// A DFA can be built using the default configuration via the
+/// [`DenseDFA::new`](enum.DenseDFA.html#method.new) constructor. Otherwise,
+/// one can configure various aspects via the
+/// [`dense::Builder`](dense/struct.Builder.html).
+///
+/// A single DFA fundamentally supports the following operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of the first possible match.
+/// 3. Location of the end of the leftmost-first match.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of a
+/// match, *two* DFAs are required. This functionality is provided by a
+/// [`Regex`](struct.Regex.html), which can be built with its basic
+/// constructor, [`Regex::new`](struct.Regex.html#method.new), or with
+/// a [`RegexBuilder`](struct.RegexBuilder.html).
+///
+/// # State size
+///
+/// A `DenseDFA` has two type parameters, `T` and `S`. `T` corresponds to
+/// the type of the DFA's transition table while `S` corresponds to the
+/// representation used for the DFA's state identifiers as described by the
+/// [`StateID`](trait.StateID.html) trait. The `S` type parameter is typically
+/// `usize`, but other valid choices provided by this crate include `u8`,
+/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
+/// identifier representation than the default is to reduce the amount of
+/// memory used by a DFA. Note though, that if the chosen representation cannot
+/// accommodate the size of your DFA, then building the DFA will fail and
+/// return an error.
+///
+/// While the reduction in heap memory used by a DFA is one reason for choosing
+/// a smaller state identifier representation, another possible reason is for
+/// decreasing the serialization size of a DFA, as returned by
+/// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian),
+/// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
+/// or
+/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
+///
+/// The type of the transition table is typically either `Vec<S>` or `&[S]`,
+/// depending on where the transition table is stored.
+///
+/// # Variants
+///
+/// This DFA is defined as a non-exhaustive enumeration of different types of
+/// dense DFAs. All of these dense DFAs use the same internal representation
+/// for the transition table, but they vary in how the transition table is
+/// read. A DFA's specific variant depends on the configuration options set via
+/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
+/// `PremultipliedByteClass`.
+///
+/// # The `DFA` trait
+///
+/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
+/// can be used for searching. For example:
+///
+/// ```
+/// use regex_automata::{DFA, DenseDFA};
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// let dfa = DenseDFA::new("foo[0-9]+")?;
+/// assert_eq!(Some(8), dfa.find(b"foo12345"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+///
+/// The `DFA` trait also provides an assortment of other lower level methods
+/// for DFAs, such as `start_state` and `next_state`. While these are correctly
+/// implemented, it is an anti-pattern to use them in performance sensitive
+/// code on the `DenseDFA` type directly. Namely, each implementation requires
+/// a branch to determine which type of dense DFA is being used. Instead,
+/// this branch should be pushed up a layer in the code since walking the
+/// transitions of a DFA is usually a hot path. If you do need to use these
+/// lower level methods in performance critical code, then you should match on
+/// the variants of this DFA and use each variant's implementation of the `DFA`
+/// trait directly.
+#[derive(Clone, Debug)]
+pub enum DenseDFA<T: AsRef<[S]>, S: StateID> {
+    /// A standard DFA that does not use premultiplication or byte classes.
+    Standard(Standard<T, S>),
+    /// A DFA that shrinks its alphabet to a set of equivalence classes instead
+    /// of using all possible byte values. Any two bytes belong to the same
+    /// equivalence class if and only if they can be used interchangeably
+    /// anywhere in the DFA while never discriminating between a match and a
+    /// non-match.
+    ///
+    /// This type of DFA can result in significant space reduction with a very
+    /// small match time performance penalty.
+    ByteClass(ByteClass<T, S>),
+    /// A DFA that premultiplies all of its state identifiers in its
+    /// transition table. This saves an instruction per byte at match time
+    /// which improves search performance.
+    ///
+    /// The only downside of premultiplication is that it may prevent one from
+    /// using a smaller state identifier representation than you otherwise
+    /// could.
+    Premultiplied(Premultiplied<T, S>),
+    /// The default configuration of a DFA, which uses byte classes and
+    /// premultiplies its state identifiers.
+    PremultipliedByteClass(PremultipliedByteClass<T, S>),
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
+    /// Borrow the `Repr` wrapped by whichever variant this DFA is.
+    ///
+    /// Every real variant is a newtype around the same `Repr`.
+    fn repr(&self) -> &Repr<T, S> {
+        match *self {
+            DenseDFA::__Nonexhaustive => unreachable!(),
+            DenseDFA::PremultipliedByteClass(ref dfa) => &dfa.0,
+            DenseDFA::Premultiplied(ref dfa) => &dfa.0,
+            DenseDFA::ByteClass(ref dfa) => &dfa.0,
+            DenseDFA::Standard(ref dfa) => &dfa.0,
+        }
+    }
+}
+
+#[cfg(feature = "std")]
+impl DenseDFA<Vec<usize>, usize> {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding DFA, or the error reported by the builder.
+    ///
+    /// The default configuration uses `usize` for state IDs, premultiplies
+    /// them and reduces the alphabet size by splitting bytes into equivalence
+    /// classes. The DFA is *not* minimized.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`dense::Builder`](dense/struct.Builder.html)
+    /// to set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
+    /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
+        Builder::new().build(pattern)
+    }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> DenseDFA<Vec<S>, S> {
+    /// Create a new empty DFA that never matches any input, including `b""`.
+    ///
+    /// # Example
+    ///
+    /// In order to build an empty DFA, callers must provide a type hint
+    /// indicating their choice of state identifier representation.
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa: DenseDFA<Vec<usize>, usize> = DenseDFA::empty();
+    /// assert_eq!(None, dfa.find(b""));
+    /// assert_eq!(None, dfa.find(b"foo"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn empty() -> DenseDFA<Vec<S>, S> {
+        Repr::empty().into_dense_dfa()
+    }
+}
+
+impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
+    /// Cheaply return a borrowed version of this dense DFA. Specifically, the
+    /// DFA returned always uses `&[S]` for its transition table while keeping
+    /// the same state identifier representation.
+    pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> {
+        match *self {
+            DenseDFA::Standard(ref r) => {
+                DenseDFA::Standard(Standard(r.0.as_ref()))
+            }
+            DenseDFA::ByteClass(ref r) => {
+                DenseDFA::ByteClass(ByteClass(r.0.as_ref()))
+            }
+            DenseDFA::Premultiplied(ref r) => {
+                DenseDFA::Premultiplied(Premultiplied(r.0.as_ref()))
+            }
+            DenseDFA::PremultipliedByteClass(ref r) => {
+                let inner = PremultipliedByteClass(r.0.as_ref());
+                DenseDFA::PremultipliedByteClass(inner)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    /// Return an owned version of this dense DFA. Specifically, the DFA
+    /// returned always uses `Vec<S>` for its transition table while keeping
+    /// the same state identifier representation.
+    ///
+    /// Effectively, this returns a dense DFA whose transition table lives
+    /// on the heap.
+    #[cfg(feature = "std")]
+    pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> {
+        match *self {
+            DenseDFA::Standard(ref r) => {
+                DenseDFA::Standard(Standard(r.0.to_owned()))
+            }
+            DenseDFA::ByteClass(ref r) => {
+                DenseDFA::ByteClass(ByteClass(r.0.to_owned()))
+            }
+            DenseDFA::Premultiplied(ref r) => {
+                DenseDFA::Premultiplied(Premultiplied(r.0.to_owned()))
+            }
+            DenseDFA::PremultipliedByteClass(ref r) => {
+                let inner = PremultipliedByteClass(r.0.to_owned());
+                DenseDFA::PremultipliedByteClass(inner)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    /// Returns the memory usage, in bytes, of this DFA.
+    ///
+    /// The memory usage is computed based on the number of bytes used to
+    /// represent this DFA's transition table. This corresponds to heap memory
+    /// usage.
+    ///
+    /// This does **not** include the stack size used up by this DFA. To
+    /// compute that, use `std::mem::size_of::<DenseDFA>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.repr().memory_usage()
+    }
+}
+
+/// Routines for converting a dense DFA to other representations, such as
+/// sparse DFAs, smaller state identifiers or raw bytes suitable for persistent
+/// storage.
+#[cfg(feature = "std")]
+impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
+    /// Convert this dense DFA to a sparse DFA.
+    ///
+    /// This is a convenience routine for `to_sparse_sized` that fixes the
+    /// state identifier representation of the sparse DFA to the same
+    /// representation used for this dense DFA.
+    ///
+    /// If the chosen state identifier representation is too small to represent
+    /// all states in the sparse DFA, then this returns an error. In most
+    /// cases, if a dense DFA is constructable with `S` then a sparse DFA will
+    /// be as well. However, it is not guaranteed.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dense = DenseDFA::new("foo[0-9]+")?;
+    /// let sparse = dense.to_sparse()?;
+    /// assert_eq!(Some(8), sparse.find(b"foo12345"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn to_sparse(&self) -> Result<SparseDFA<Vec<u8>, S>> {
+        self.to_sparse_sized()
+    }
+
+    /// Convert this dense DFA to a sparse DFA.
+    ///
+    /// Using this routine requires supplying a type hint to choose the state
+    /// identifier representation for the resulting sparse DFA.
+    ///
+    /// If the chosen state identifier representation is too small to represent
+    /// all states in the sparse DFA, then this returns an error.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dense = DenseDFA::new("foo[0-9]+")?;
+    /// let sparse = dense.to_sparse_sized::<u8>()?;
+    /// assert_eq!(Some(8), sparse.find(b"foo12345"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn to_sparse_sized<A: StateID>(
+        &self,
+    ) -> Result<SparseDFA<Vec<u8>, A>> {
+        self.repr().to_sparse_sized()
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA,
+    /// but attempt to use `u8` for the representation of state identifiers.
+    /// If `u8` is insufficient to represent all state identifiers in this
+    /// DFA, then this returns an error.
+    ///
+    /// This is a convenience routine for `to_sized::<u8>()`.
+    pub fn to_u8(&self) -> Result<DenseDFA<Vec<u8>, u8>> {
+        self.to_sized()
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA,
+    /// but attempt to use `u16` for the representation of state identifiers.
+    /// If `u16` is insufficient to represent all state identifiers in this
+    /// DFA, then this returns an error.
+    ///
+    /// This is a convenience routine for `to_sized::<u16>()`.
+    pub fn to_u16(&self) -> Result<DenseDFA<Vec<u16>, u16>> {
+        self.to_sized()
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA,
+    /// but attempt to use `u32` for the representation of state identifiers.
+    /// If `u32` is insufficient to represent all state identifiers in this
+    /// DFA, then this returns an error.
+    ///
+    /// This is a convenience routine for `to_sized::<u32>()`.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub fn to_u32(&self) -> Result<DenseDFA<Vec<u32>, u32>> {
+        self.to_sized()
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA,
+    /// but attempt to use `u64` for the representation of state identifiers.
+    /// If `u64` is insufficient to represent all state identifiers in this
+    /// DFA, then this returns an error.
+    ///
+    /// This is a convenience routine for `to_sized::<u64>()`.
+    #[cfg(target_pointer_width = "64")]
+    pub fn to_u64(&self) -> Result<DenseDFA<Vec<u64>, u64>> {
+        self.to_sized()
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA, but
+    /// attempt to use `A` for the representation of state identifiers. If `A`
+    /// is insufficient to represent all state identifiers in this DFA, then
+    /// this returns an error.
+    ///
+    /// An alternative way to construct such a DFA is to use
+    /// [`dense::Builder::build_with_size`](dense/struct.Builder.html#method.build_with_size).
+    /// In general, using the builder is preferred since it will use the given
+    /// state identifier representation throughout determinization (and
+    /// minimization, if done), and thereby using less memory throughout the
+    /// entire construction process. However, these routines are necessary
+    /// in cases where, say, a minimized DFA could fit in a smaller state
+    /// identifier representation, but the initial determinized DFA would not.
+    pub fn to_sized<A: StateID>(&self) -> Result<DenseDFA<Vec<A>, A>> {
+        self.repr().to_sized().map(|r| r.into_dense_dfa())
+    }
+
+    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in little
+    /// endian format.
+    ///
+    /// If the state identifier representation of this DFA has a size different
+    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+    /// implementations of `StateID` provided by this crate satisfy this
+    /// requirement.
+    pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
+        self.repr().to_bytes::<LittleEndian>()
+    }
+
+    /// Serialize this DFA to raw bytes in big endian format. The output is
+    /// laid out for use at an 8 byte aligned address.
+    ///
+    /// An error is returned if the size of the state identifier
+    /// representation is anything other than 1, 2, 4 or 8 bytes. Every
+    /// `StateID` implementation shipped with this crate meets that
+    /// requirement.
+    pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
+        let repr = self.repr();
+        repr.to_bytes::<BigEndian>()
+    }
+
+    /// Serialize this DFA to raw bytes using the endianness of the current
+    /// platform. The output is laid out for use at an 8 byte aligned
+    /// address.
+    ///
+    /// Picking an explicit endianness with `to_bytes_little_endian` or
+    /// `to_bytes_big_endian` is generally preferable. This routine is mostly
+    /// useful in tests, where a DFA is serialized and deserialized on the
+    /// same platform.
+    ///
+    /// An error is returned if the size of the state identifier
+    /// representation is anything other than 1, 2, 4 or 8 bytes. Every
+    /// `StateID` implementation shipped with this crate meets that
+    /// requirement.
+    pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
+        let repr = self.repr();
+        repr.to_bytes::<NativeEndian>()
+    }
+}
+
+impl<'a, S: StateID> DenseDFA<&'a [S], S> {
+    /// Deserialize a DFA with a specific state identifier representation.
+    ///
+    /// Deserializing a DFA using this routine will never allocate heap memory.
+    /// This is also guaranteed to be a constant time operation that does not
+    /// vary with the size of the DFA.
+    ///
+    /// The bytes given should be generated by the serialization of a DFA with
+    /// either the
+    /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
+    /// method or the
+    /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
+    /// method, depending on the endianness of the machine you are
+    /// deserializing this DFA on.
+    ///
+    /// If the state identifier representation is `usize`, then deserialization
+    /// is dependent on the pointer size. For this reason, it is best to
+    /// serialize DFAs using a fixed size representation for your state
+    /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
+    ///
+    /// # Panics
+    ///
+    /// The bytes given should be *trusted*. In particular, if the bytes
+    /// are not a valid serialization of a DFA, or if the given bytes are
+    /// not aligned to an 8 byte boundary, or if the endianness of the
+    /// serialized bytes is different than the endianness of the machine that
+    /// is deserializing the DFA, then this routine will panic. Moreover, it is
+    /// possible for this deserialization routine to succeed even if the given
+    /// bytes do not represent a valid serialized dense DFA.
+    ///
+    /// # Safety
+    ///
+    /// This routine is unsafe because it permits callers to provide an
+    /// arbitrary transition table with possibly incorrect transitions. While
+    /// the various serialization routines will never return an incorrect
+    /// transition table, there is no guarantee that the bytes provided here
+    /// are correct. While deserialization does many checks (as documented
+    /// above in the panic conditions), this routine does not check that the
+    /// transition table is correct. Given an incorrect transition table, it is
+    /// possible for the search routines to access out-of-bounds memory because
+    /// of explicit bounds check elision.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize a DFA to raw bytes, deserialize it
+    /// and then use it for searching. Note that we first convert the DFA to
+    /// using `u16` for its state identifier representation before serializing
+    /// it. While this isn't strictly necessary, it's good practice in order to
+    /// decrease the size of the DFA and to avoid platform specific pitfalls
+    /// such as differing pointer sizes.
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let initial = DenseDFA::new("foo[0-9]+")?;
+    /// let bytes = initial.to_u16()?.to_bytes_native_endian()?;
+    /// let dfa: DenseDFA<&[u16], u16> = unsafe {
+    ///     DenseDFA::from_bytes(&bytes)
+    /// };
+    ///
+    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> {
+        // The header parsing and sanity checks live in `Repr::from_bytes`.
+        // Here the resulting representation is only wrapped in the
+        // `DenseDFA` variant chosen by `into_dense_dfa`.
+        Repr::from_bytes(buf).into_dense_dfa()
+    }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> DenseDFA<Vec<S>, S> {
+    /// Minimize this DFA, mutating it in place.
+    ///
+    /// This is hidden from the documented public API. It is exposed only so
+    /// that external benchmarks can measure minimization on its own.
+    #[doc(hidden)]
+    pub fn minimize(&mut self) {
+        let repr = self.repr_mut();
+        repr.minimize();
+    }
+
+    /// Mutably borrow the internal representation wrapped by whichever
+    /// variant this DFA is.
+    fn repr_mut(&mut self) -> &mut Repr<Vec<S>, S> {
+        match self {
+            DenseDFA::Standard(dfa) => &mut dfa.0,
+            DenseDFA::ByteClass(dfa) => &mut dfa.0,
+            DenseDFA::Premultiplied(dfa) => &mut dfa.0,
+            DenseDFA::PremultipliedByteClass(dfa) => &mut dfa.0,
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+impl<T: AsRef<[S]>, S: StateID> DFA for DenseDFA<T, S> {
+    type ID = S;
+
+    /// The state in which every search begins.
+    #[inline]
+    fn start_state(&self) -> S {
+        let repr = self.repr();
+        repr.start_state()
+    }
+
+    /// Whether `id` identifies a match state.
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        let repr = self.repr();
+        repr.is_match_state(id)
+    }
+
+    /// Whether `id` identifies the dead state.
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        let repr = self.repr();
+        repr.is_dead_state(id)
+    }
+
+    /// Whether `id` identifies either a match state or the dead state.
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        let repr = self.repr();
+        repr.is_match_or_dead_state(id)
+    }
+
+    /// Whether this DFA is anchored.
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        let repr = self.repr();
+        repr.is_anchored()
+    }
+
+    /// Follow the transition out of `current` for the byte `input`,
+    /// dispatching to the transition function of the active variant.
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        match self {
+            DenseDFA::Standard(dfa) => dfa.next_state(current, input),
+            DenseDFA::ByteClass(dfa) => dfa.next_state(current, input),
+            DenseDFA::Premultiplied(dfa) => dfa.next_state(current, input),
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.next_state(current, input)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    /// Same as `next_state`, but dispatches to the unchecked transition
+    /// function of the active variant.
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        match self {
+            DenseDFA::Standard(dfa) => {
+                dfa.next_state_unchecked(current, input)
+            }
+            DenseDFA::ByteClass(dfa) => {
+                dfa.next_state_unchecked(current, input)
+            }
+            DenseDFA::Premultiplied(dfa) => {
+                dfa.next_state_unchecked(current, input)
+            }
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.next_state_unchecked(current, input)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    // The search routines below are overridden so that the choice between
+    // the dense DFA variants is made a single time, before the search
+    // starts, rather than once for every transition taken.
+
+    #[inline]
+    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+        match self {
+            DenseDFA::Standard(dfa) => dfa.is_match_at(bytes, start),
+            DenseDFA::ByteClass(dfa) => dfa.is_match_at(bytes, start),
+            DenseDFA::Premultiplied(dfa) => dfa.is_match_at(bytes, start),
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.is_match_at(bytes, start)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match self {
+            DenseDFA::Standard(dfa) => dfa.shortest_match_at(bytes, start),
+            DenseDFA::ByteClass(dfa) => dfa.shortest_match_at(bytes, start),
+            DenseDFA::Premultiplied(dfa) => {
+                dfa.shortest_match_at(bytes, start)
+            }
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.shortest_match_at(bytes, start)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match self {
+            DenseDFA::Standard(dfa) => dfa.find_at(bytes, start),
+            DenseDFA::ByteClass(dfa) => dfa.find_at(bytes, start),
+            DenseDFA::Premultiplied(dfa) => dfa.find_at(bytes, start),
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.find_at(bytes, start)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match self {
+            DenseDFA::Standard(dfa) => dfa.rfind_at(bytes, start),
+            DenseDFA::ByteClass(dfa) => dfa.rfind_at(bytes, start),
+            DenseDFA::Premultiplied(dfa) => dfa.rfind_at(bytes, start),
+            DenseDFA::PremultipliedByteClass(dfa) => {
+                dfa.rfind_at(bytes, start)
+            }
+            DenseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+/// A plain dense DFA: no premultiplied state identifiers and no byte
+/// classes.
+///
+/// A `DenseDFA` can be searched with directly, so reaching for this type is
+/// rarely required. It is mainly of interest when writing custom search
+/// routines that walk the transitions of a DFA by hand, since this type (like
+/// the other DFA variant types) implements `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct Standard<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for Standard<T, S> {
+    type ID = S;
+
+    /// The state in which every search begins.
+    #[inline]
+    fn start_state(&self) -> S {
+        let repr = &self.0;
+        repr.start_state()
+    }
+
+    /// Whether `id` identifies a match state.
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_state(id)
+    }
+
+    /// Whether `id` identifies the dead state.
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_dead_state(id)
+    }
+
+    /// Whether `id` identifies either a match state or the dead state.
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_or_dead_state(id)
+    }
+
+    /// Whether this DFA is anchored.
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        let repr = &self.0;
+        repr.is_anchored()
+    }
+
+    /// Look up the transition for `input` in the row of `current`. Rows are
+    /// `ALPHABET_LEN` entries wide and indexed by the raw byte.
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        let row = current.to_usize() * ALPHABET_LEN;
+        let column = input as usize;
+        self.0.trans()[row + column]
+    }
+
+    /// Same lookup as `next_state`, without the bounds check on the
+    /// transition table.
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        let row = current.to_usize() * ALPHABET_LEN;
+        let column = input as usize;
+        *self.0.trans().get_unchecked(row + column)
+    }
+}
+
+/// A dense DFA with a reduced alphabet.
+///
+/// Instead of one transition per possible byte value, this DFA has one
+/// transition per equivalence class of bytes. Two bytes share a class if and
+/// only if they can be swapped for one another anywhere in the DFA without
+/// ever changing whether the input is a match or a non-match.
+///
+/// The result can be a significant reduction in space, paid for with a very
+/// small penalty at match time.
+///
+/// A `DenseDFA` can be searched with directly, so reaching for this type is
+/// rarely required. It is mainly of interest when writing custom search
+/// routines that walk the transitions of a DFA by hand, since this type (like
+/// the other DFA variant types) implements `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct ByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for ByteClass<T, S> {
+    type ID = S;
+
+    /// The state in which every search begins.
+    #[inline]
+    fn start_state(&self) -> S {
+        let repr = &self.0;
+        repr.start_state()
+    }
+
+    /// Whether `id` identifies a match state.
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_state(id)
+    }
+
+    /// Whether `id` identifies the dead state.
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_dead_state(id)
+    }
+
+    /// Whether `id` identifies either a match state or the dead state.
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_or_dead_state(id)
+    }
+
+    /// Whether this DFA is anchored.
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        let repr = &self.0;
+        repr.is_anchored()
+    }
+
+    /// Map `input` to its equivalence class, then look that class up in the
+    /// row of `current`. Rows are `alphabet_len()` entries wide.
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        let class = self.0.byte_classes().get(input) as usize;
+        let row = current.to_usize() * self.0.alphabet_len();
+        self.0.trans()[row + class]
+    }
+
+    /// Same lookup as `next_state`, using the unchecked accessors for both
+    /// the byte class map and the transition table.
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        let class = self.0.byte_classes().get_unchecked(input) as usize;
+        let row = current.to_usize() * self.0.alphabet_len();
+        *self.0.trans().get_unchecked(row + class)
+    }
+}
+
+/// A dense DFA whose transition table stores premultiplied state
+/// identifiers.
+///
+/// Premultiplication removes one instruction per byte from the match loop,
+/// which improves search performance.
+///
+/// Its only drawback is that it may rule out a smaller state identifier
+/// representation that would otherwise have been usable.
+///
+/// A `DenseDFA` can be searched with directly, so reaching for this type is
+/// rarely required. It is mainly of interest when writing custom search
+/// routines that walk the transitions of a DFA by hand, since this type (like
+/// the other DFA variant types) implements `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct Premultiplied<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for Premultiplied<T, S> {
+    type ID = S;
+
+    /// The state in which every search begins.
+    #[inline]
+    fn start_state(&self) -> S {
+        let repr = &self.0;
+        repr.start_state()
+    }
+
+    /// Whether `id` identifies a match state.
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_state(id)
+    }
+
+    /// Whether `id` identifies the dead state.
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_dead_state(id)
+    }
+
+    /// Whether `id` identifies either a match state or the dead state.
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_or_dead_state(id)
+    }
+
+    /// Whether this DFA is anchored.
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        let repr = &self.0;
+        repr.is_anchored()
+    }
+
+    /// Look up the transition for `input`. The identifier `current` is used
+    /// directly as the offset of its row, so no multiplication is needed.
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        let row = current.to_usize();
+        let column = input as usize;
+        self.0.trans()[row + column]
+    }
+
+    /// Same lookup as `next_state`, without the bounds check on the
+    /// transition table.
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        let row = current.to_usize();
+        let column = input as usize;
+        *self.0.trans().get_unchecked(row + column)
+    }
+}
+
+/// A dense DFA that both uses byte classes and premultiplies its state
+/// identifiers. This is the default configuration of a dense DFA.
+///
+/// A `DenseDFA` can be searched with directly, so reaching for this type is
+/// rarely required. It is mainly of interest when writing custom search
+/// routines that walk the transitions of a DFA by hand, since this type (like
+/// the other DFA variant types) implements `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct PremultipliedByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
+
+impl<T: AsRef<[S]>, S: StateID> DFA for PremultipliedByteClass<T, S> {
+    type ID = S;
+
+    /// The state in which every search begins.
+    #[inline]
+    fn start_state(&self) -> S {
+        let repr = &self.0;
+        repr.start_state()
+    }
+
+    /// Whether `id` identifies a match state.
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_state(id)
+    }
+
+    /// Whether `id` identifies the dead state.
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_dead_state(id)
+    }
+
+    /// Whether `id` identifies either a match state or the dead state.
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        let repr = &self.0;
+        repr.is_match_or_dead_state(id)
+    }
+
+    /// Whether this DFA is anchored.
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        let repr = &self.0;
+        repr.is_anchored()
+    }
+
+    /// Map `input` to its equivalence class, then look that class up using
+    /// `current` directly as the offset of its row.
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        let class = self.0.byte_classes().get(input) as usize;
+        let row = current.to_usize();
+        self.0.trans()[row + class]
+    }
+
+    /// Same lookup as `next_state`, using the unchecked accessors for both
+    /// the byte class map and the transition table.
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        let class = self.0.byte_classes().get_unchecked(input) as usize;
+        let row = current.to_usize();
+        *self.0.trans().get_unchecked(row + class)
+    }
+}
+
+/// The internal representation of a dense DFA.
+///
+/// This representation is shared by all DFA variants.
+#[derive(Clone)]
+#[cfg_attr(not(feature = "std"), derive(Debug))]
+pub(crate) struct Repr<T, S> {
+    /// Whether the state identifiers in the transition table have been
+    /// premultiplied or not.
+    ///
+    /// Premultiplied identifiers means that instead of your matching loop
+    /// looking something like this:
+    ///
+    ///   state = dfa.start
+    ///   for byte in haystack:
+    ///       next = dfa.transitions[state * len(alphabet) + byte]
+    ///       if dfa.is_match(next):
+    ///           return true
+    ///   return false
+    ///
+    /// it can instead look like this:
+    ///
+    ///   state = dfa.start
+    ///   for byte in haystack:
+    ///       next = dfa.transitions[state + byte]
+    ///       if dfa.is_match(next):
+    ///           return true
+    ///   return false
+    ///
+    /// In other words, we save a multiplication instruction in the critical
+    /// path. This turns out to be a decent performance win. The cost of using
+    /// premultiplied state ids is that they can require a bigger state id
+    /// representation.
+    premultiplied: bool,
+    /// Whether this DFA can only match at the beginning of input or not.
+    ///
+    /// When true, a match should only be reported if it begins at the 0th
+    /// index of the haystack.
+    anchored: bool,
+    /// The initial start state ID.
+    start: S,
+    /// The total number of states in this DFA. Note that a DFA always has at
+    /// least one state---the dead state---even the empty DFA. In particular,
+    /// the dead state always has ID 0 and is correspondingly always the first
+    /// state. The dead state is never a match state.
+    state_count: usize,
+    /// States in a DFA have a *partial* ordering such that a match state
+    /// always precedes any non-match state (except for the special dead
+    /// state).
+    ///
+    /// `max_match` corresponds to the last state that is a match state. This
+    /// encoding has two critical benefits. Firstly, we are not required to
+    /// store any additional per-state information about whether it is a match
+    /// state or not. Secondly, when searching with the DFA, we can do a single
+    /// comparison with `max_match` for each byte instead of two comparisons
+    /// for each byte (one testing whether it is a match and the other testing
+    /// whether we've reached a dead state). Namely, to determine the status
+    /// of the next state, we can do this:
+    ///
+    ///   next_state = transition[cur_state * alphabet_len + cur_byte]
+    ///   if next_state <= max_match:
+    ///       // next_state is either dead (no-match) or a match
+    ///       return next_state != dead
+    max_match: S,
+    /// A set of equivalence classes, where a single equivalence class
+    /// represents a set of bytes that never discriminate between a match
+    /// and a non-match in the DFA. Each equivalence class corresponds to
+    /// a single letter in this DFA's alphabet, where the maximum number of
+    /// letters is 256 (each possible value of a byte). Consequently, the
+    /// number of equivalence classes corresponds to the number of transitions
+    /// for each DFA state.
+    ///
+    /// The only time the number of equivalence classes is fewer than 256 is
+    /// if the DFA's kind uses byte classes. If the DFA doesn't use byte
+    /// classes, then this holds the singleton mapping, in which every byte
+    /// is its own class (as built by `ByteClasses::singletons`).
+    byte_classes: ByteClasses,
+    /// A contiguous region of memory representing the transition table in
+    /// row-major order. The representation is dense. That is, every state has
+    /// precisely the same number of transitions. The maximum number of
+    /// transitions is 256. If a DFA has been instructed to use byte classes,
+    /// then the number of transitions can be much less.
+    ///
+    /// In practice, T is either Vec<S> or &[S].
+    trans: T,
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> Repr<Vec<S>, S> {
+    /// Build an empty DFA in which every byte is its own equivalence class
+    /// (singleton byte classes).
+    pub fn empty() -> Repr<Vec<S>, S> {
+        let singletons = ByteClasses::singletons();
+        Repr::empty_with_byte_classes(singletons)
+    }
+
+    /// Build an empty DFA that uses the given byte equivalence classes. An
+    /// empty DFA never matches any input.
+    ///
+    /// The DFA starts out unpremultiplied and anchored, with the dead state
+    /// as its start state, and has one empty state added to it before it is
+    /// returned.
+    pub fn empty_with_byte_classes(
+        byte_classes: ByteClasses,
+    ) -> Repr<Vec<S>, S> {
+        let mut repr = Repr {
+            premultiplied: false,
+            anchored: true,
+            start: dead_id(),
+            state_count: 0,
+            max_match: S::from_usize(0),
+            byte_classes: byte_classes,
+            trans: Vec::new(),
+        };
+        // Adding the first state cannot fail: every state ID representation
+        // must be able to fit at least one state.
+        repr.add_empty_state().unwrap();
+        repr
+    }
+
+    /// Consume this DFA and return it with its anchored flag set to `yes`.
+    pub fn anchored(mut self, yes: bool) -> Repr<Vec<S>, S> {
+        self.anchored = yes;
+        self
+    }
+}
+
+impl<T: AsRef<[S]>, S: StateID> Repr<T, S> {
+    /// Convert this internal DFA representation to a DenseDFA based on its
+    /// transition table access pattern.
+    ///
+    /// The variant is chosen from two properties: whether the state
+    /// identifiers are premultiplied, and whether the byte classes are
+    /// singletons (which is treated as "no byte classes").
+    pub fn into_dense_dfa(self) -> DenseDFA<T, S> {
+        match (self.premultiplied, self.byte_classes().is_singleton()) {
+            // no premultiplication, no byte classes
+            (false, true) => DenseDFA::Standard(Standard(self)),
+            // no premultiplication, yes byte classes
+            (false, false) => DenseDFA::ByteClass(ByteClass(self)),
+            // yes premultiplication, no byte classes
+            (true, true) => DenseDFA::Premultiplied(Premultiplied(self)),
+            // yes premultiplication, yes byte classes
+            (true, false) => {
+                DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self))
+            }
+        }
+    }
+
+    /// Return a copy of this representation that borrows the transition
+    /// table as a slice instead of owning it.
+    fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> {
+        Repr {
+            premultiplied: self.premultiplied,
+            anchored: self.anchored,
+            start: self.start,
+            state_count: self.state_count,
+            max_match: self.max_match,
+            // `ByteClasses` is `Copy`, so no explicit clone is required.
+            byte_classes: self.byte_classes,
+            trans: self.trans(),
+        }
+    }
+
+    /// Return a copy of this representation that owns its transition table.
+    /// The table is copied into a freshly allocated `Vec`.
+    #[cfg(feature = "std")]
+    fn to_owned(&self) -> Repr<Vec<S>, S> {
+        Repr {
+            premultiplied: self.premultiplied,
+            anchored: self.anchored,
+            start: self.start,
+            state_count: self.state_count,
+            max_match: self.max_match,
+            // `ByteClasses` is `Copy`, so no explicit clone is required.
+            byte_classes: self.byte_classes,
+            trans: self.trans().to_vec(),
+        }
+    }
+
+    /// Return the starting state of this DFA.
+    ///
+    /// All searches using this DFA must begin at this state. There is exactly
+    /// one starting state for every DFA. A starting state may be a dead state
+    /// or a matching state or neither.
+    pub fn start_state(&self) -> S {
+        self.start
+    }
+
+    /// Returns true if and only if the given identifier corresponds to a match
+    /// state.
+    ///
+    /// A match state is any state at or below `max_match` other than the
+    /// dead state.
+    pub fn is_match_state(&self, id: S) -> bool {
+        id <= self.max_match && id != dead_id()
+    }
+
+    /// Returns true if and only if the given identifier corresponds to a dead
+    /// state.
+    pub fn is_dead_state(&self, id: S) -> bool {
+        id == dead_id()
+    }
+
+    /// Returns true if and only if the given identifier could correspond to
+    /// either a match state or a dead state. If this returns false, then the
+    /// given identifier does not correspond to either a match state or a dead
+    /// state.
+    pub fn is_match_or_dead_state(&self, id: S) -> bool {
+        id <= self.max_match_state()
+    }
+
+    /// Returns the maximum identifier for which a match state can exist.
+    ///
+    /// More specifically, the return identifier always corresponds to either
+    /// a match state or a dead state. Namely, either
+    /// `is_match_state(returned)` or `is_dead_state(returned)` is guaranteed
+    /// to be true.
+    pub fn max_match_state(&self) -> S {
+        self.max_match
+    }
+
+    /// Returns true if and only if this DFA is anchored.
+    pub fn is_anchored(&self) -> bool {
+        self.anchored
+    }
+
+    /// Return the byte classes used by this DFA.
+    pub fn byte_classes(&self) -> &ByteClasses {
+        &self.byte_classes
+    }
+
+    /// Returns an iterator over all states in this DFA.
+    ///
+    /// This iterator yields a tuple for each state. The first element of the
+    /// tuple corresponds to a state's identifier, and the second element
+    /// corresponds to the state itself (comprised of its transitions).
+    ///
+    /// If this DFA is premultiplied, then the state identifiers are in
+    /// turn premultiplied as well, making them usable without additional
+    /// modification.
+    #[cfg(feature = "std")]
+    pub fn states(&self) -> StateIter<T, S> {
+        // Each chunk of `alphabet_len()` transitions is one state's row.
+        let it = self.trans().chunks(self.alphabet_len());
+        StateIter { dfa: self, it: it.enumerate() }
+    }
+
+    /// Return the total number of states in this DFA. Every DFA has at least
+    /// 1 state, even the empty DFA.
+    #[cfg(feature = "std")]
+    pub fn state_count(&self) -> usize {
+        self.state_count
+    }
+
+    /// Return the number of elements in this DFA's alphabet.
+    ///
+    /// If this DFA doesn't use byte classes, then this is always equivalent
+    /// to 256. Otherwise, it is guaranteed to be some value less than or equal
+    /// to 256.
+    pub fn alphabet_len(&self) -> usize {
+        self.byte_classes().alphabet_len()
+    }
+
+    /// Returns the memory usage, in bytes, of this DFA.
+    ///
+    /// Only the transition table is counted.
+    pub fn memory_usage(&self) -> usize {
+        self.trans().len() * mem::size_of::<S>()
+    }
+
+    /// Convert the given state identifier to the state's index. The state's
+    /// index corresponds to the position in which it appears in the transition
+    /// table. When a DFA is NOT premultiplied, then a state's identifier is
+    /// also its index. When a DFA is premultiplied, then a state's identifier
+    /// is equal to `index * alphabet_len`. This routine reverses that.
+    #[cfg(feature = "std")]
+    pub fn state_id_to_index(&self, id: S) -> usize {
+        if self.premultiplied {
+            id.to_usize() / self.alphabet_len()
+        } else {
+            id.to_usize()
+        }
+    }
+
+    /// Return this DFA's transition table as a slice.
+    fn trans(&self) -> &[S] {
+        self.trans.as_ref()
+    }
+
+    /// Create a sparse DFA from the internal representation of a dense DFA.
+    #[cfg(feature = "std")]
+    pub fn to_sparse_sized<A: StateID>(
+        &self,
+    ) -> Result<SparseDFA<Vec<u8>, A>> {
+        SparseDFA::from_dense_sized(self)
+    }
+
+    /// Create a new DFA whose match semantics are equivalent to this DFA, but
+    /// attempt to use `A` for the representation of state identifiers. If `A`
+    /// is insufficient to represent all state identifiers in this DFA, then
+    /// this returns an error.
+    #[cfg(feature = "std")]
+    pub fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<A>, A>> {
+        // Check that this DFA can fit into A's representation. The largest
+        // identifier in use is that of the last state, scaled by the
+        // alphabet length when identifiers are premultiplied.
+        let mut last_state_id = self.state_count - 1;
+        if self.premultiplied {
+            last_state_id *= self.alphabet_len();
+        }
+        if last_state_id > A::max_id() {
+            return Err(Error::state_id_overflow(A::max_id()));
+        }
+
+        // We're off to the races. The new DFA is the same as the old one,
+        // but every identifier in its transition table is converted to the
+        // new representation in a single pass.
+        let trans: Vec<A> = self
+            .trans()
+            .iter()
+            .map(|&id| A::from_usize(id.to_usize()))
+            .collect();
+        Ok(Repr {
+            premultiplied: self.premultiplied,
+            anchored: self.anchored,
+            start: A::from_usize(self.start.to_usize()),
+            state_count: self.state_count,
+            max_match: A::from_usize(self.max_match.to_usize()),
+            byte_classes: self.byte_classes,
+            trans,
+        })
+    }
+
+    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary.
+    ///
+    /// The byte order of every multi-byte field is given by `A`.
+    ///
+    /// If the state identifier representation of this DFA has a size different
+    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+    /// implementations of `StateID` provided by this crate satisfy this
+    /// requirement.
+    #[cfg(feature = "std")]
+    pub(crate) fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
+        let label = b"rust-regex-automata-dfa\x00";
+        assert_eq!(24, label.len());
+
+        // Reject unsupported state ID sizes up front, before any buffer is
+        // allocated or written to.
+        let state_size = mem::size_of::<S>();
+        if ![1, 2, 4, 8].contains(&state_size) {
+            return Err(Error::serialize(&format!(
+                "state size of {} not supported, must be 1, 2, 4 or 8",
+                state_size
+            )));
+        }
+
+        let trans_size = state_size * self.trans().len();
+        let size =
+            // For human readable label.
+            label.len()
+            // endianness check, must be equal to 0xFEFF for native endian
+            + 2
+            // For version number.
+            + 2
+            // Size of state ID representation, in bytes.
+            // Must be 1, 2, 4 or 8.
+            + 2
+            // For DFA misc options.
+            + 2
+            // For start state.
+            + 8
+            // For state count.
+            + 8
+            // For max match state.
+            + 8
+            // For byte class map.
+            + 256
+            // For transition table.
+            + trans_size;
+        // sanity check, this can be updated if need be
+        assert_eq!(312 + trans_size, size);
+        // This must always pass. It checks that the transition table is at
+        // a properly aligned address.
+        assert_eq!(0, (size - trans_size) % 8);
+
+        let mut buf = vec![0u8; size];
+
+        // write label
+        buf[..label.len()].copy_from_slice(label);
+        let mut i = label.len();
+        // endianness check
+        A::write_u16(&mut buf[i..], 0xFEFF);
+        i += 2;
+        // version number
+        A::write_u16(&mut buf[i..], 1);
+        i += 2;
+        // size of state ID
+        A::write_u16(&mut buf[i..], state_size as u16);
+        i += 2;
+        // DFA misc options
+        let mut options = 0u16;
+        if self.premultiplied {
+            options |= MASK_PREMULTIPLIED;
+        }
+        if self.anchored {
+            options |= MASK_ANCHORED;
+        }
+        A::write_u16(&mut buf[i..], options);
+        i += 2;
+        // start state
+        A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
+        i += 8;
+        // state count
+        A::write_u64(&mut buf[i..], self.state_count as u64);
+        i += 8;
+        // max match state
+        A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
+        i += 8;
+        // byte class map: one entry per possible byte value, in order
+        for (slot, b) in buf[i..i + 256].iter_mut().zip(0..256usize) {
+            *slot = self.byte_classes().get(b as u8);
+        }
+        i += 256;
+        // transition table
+        for &id in self.trans() {
+            write_state_id_bytes::<A, _>(&mut buf[i..], id);
+            i += state_size;
+        }
+        assert_eq!(size, i, "expected to consume entire buffer");
+
+        Ok(buf)
+    }
+}
+
+impl<'a, S: StateID> Repr<&'a [S], S> {
+    /// The implementation for deserializing a DFA from raw bytes.
+    ///
+    /// The bytes are read in this order: a NUL-terminated label, four
+    /// native-endian `u16` fields (the endianness marker `0xFEFF`, the
+    /// format version, the size of a state ID in bytes and the option
+    /// flags), three native-endian `u64` fields (start state, state count
+    /// and max match state), the 256-byte byte class map and finally the
+    /// transition table. The transition table is borrowed from `buf`
+    /// rather than copied.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `buf` is not aligned for `S`, if no label terminator
+    /// is found, if the endianness marker, version or state ID size do not
+    /// match what is expected here, if the transition table is not aligned
+    /// for `S`, if the transition table size implied by the header
+    /// overflows `usize`, or if `buf` holds fewer transition table bytes
+    /// than the header implies. Bytes following the transition table are
+    /// ignored.
+    ///
+    /// # Safety
+    ///
+    /// The transition table bytes are reinterpreted as a `&[S]` without
+    /// inspecting their contents, and the start and max match states are
+    /// taken from the header as-is. NOTE(review): callers presumably must
+    /// only pass bytes produced by this crate's serialization routine for
+    /// the same `S` -- confirm against the public wrappers.
+    unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> {
+        assert_eq!(
+            0,
+            buf.as_ptr() as usize % mem::align_of::<S>(),
+            "DenseDFA starting at address {} is not aligned to {} bytes",
+            buf.as_ptr() as usize,
+            mem::align_of::<S>()
+        );
+
+        // skip over label
+        match buf.iter().position(|&b| b == b'\x00') {
+            None => panic!("could not find label"),
+            Some(i) => buf = &buf[i + 1..],
+        }
+
+        // check that current endianness is same as endianness of DFA
+        let endian_check = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+        if endian_check != 0xFEFF {
+            panic!(
+                "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
+                 are you trying to load a DenseDFA serialized with a \
+                 different endianness?",
+                endian_check,
+            );
+        }
+
+        // check that the version number is supported
+        let version = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+        if version != 1 {
+            panic!(
+                "expected version 1, but found unsupported version {}",
+                version,
+            );
+        }
+
+        // read size of state
+        let state_size = NativeEndian::read_u16(buf) as usize;
+        if state_size != mem::size_of::<S>() {
+            panic!(
+                "state size of DenseDFA ({}) does not match \
+                 requested state size ({})",
+                state_size,
+                mem::size_of::<S>(),
+            );
+        }
+        buf = &buf[2..];
+
+        // read miscellaneous options
+        let opts = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+
+        // read start state
+        let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
+        buf = &buf[8..];
+
+        // read state count
+        let state_count = NativeEndian::read_u64(buf) as usize;
+        buf = &buf[8..];
+
+        // read max match state
+        let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
+        buf = &buf[8..];
+
+        // read byte classes
+        let byte_classes = ByteClasses::from_slice(&buf[..256]);
+        buf = &buf[256..];
+
+        // The state count comes straight from the header, so compute the
+        // table size with checked arithmetic: an absurd count must panic
+        // instead of wrapping around to a small length.
+        let len = state_count
+            .checked_mul(byte_classes.alphabet_len())
+            .expect("DenseDFA transition table length overflows usize");
+        let len_bytes = len
+            .checked_mul(state_size)
+            .expect("DenseDFA transition table size overflows usize");
+        // The remaining bytes must cover the whole transition table. Extra
+        // trailing bytes are fine; too few bytes would make the raw slice
+        // below read out of bounds.
+        assert!(
+            buf.len() >= len_bytes,
+            "insufficient transition table bytes, \
+             expected at least {} but only have {}",
+            len_bytes,
+            buf.len()
+        );
+        assert_eq!(
+            0,
+            buf.as_ptr() as usize % mem::align_of::<S>(),
+            "DenseDFA transition table is not properly aligned"
+        );
+
+        // SAFETY: This is the only actual not-safe thing in this entire
+        // routine. The key things we need to worry about here are alignment
+        // and size. The two asserts above cover both conditions: `buf` is
+        // aligned for `S` and holds at least `len * size_of::<S>()` bytes.
+        let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len);
+        Repr {
+            premultiplied: opts & MASK_PREMULTIPLIED > 0,
+            anchored: opts & MASK_ANCHORED > 0,
+            start,
+            state_count,
+            max_match,
+            byte_classes,
+            trans,
+        }
+    }
+}
+
+/// The following methods implement mutable routines on the internal
+/// representation of a DFA. As such, we must fix the first type parameter to
+/// a `Vec<S>` since a generic `T: AsRef<[S]>` does not permit mutation. We
+/// can get away with this because these methods are internal to the crate and
+/// are exclusively used during construction of the DFA.
+#[cfg(feature = "std")]
+impl<S: StateID> Repr<Vec<S>, S> {
+    /// Premultiply every state identifier in this DFA by the alphabet
+    /// length, so that an identifier is the offset of its row in the
+    /// transition table.
+    ///
+    /// This is a no-op if the DFA is already premultiplied or has at most
+    /// one state. An error from `premultiply_overflow_error` is propagated
+    /// (presumably raised when the largest premultiplied identifier does
+    /// not fit in `S`).
+    pub fn premultiply(&mut self) -> Result<()> {
+        if self.premultiplied || self.state_count <= 1 {
+            return Ok(());
+        }
+
+        let stride = self.alphabet_len();
+        let last_state = S::from_usize(self.state_count - 1);
+        premultiply_overflow_error(last_state, stride)?;
+
+        // Rewrite the targets while the DFA is still flagged as not
+        // premultiplied, since `get_state_mut` refuses premultiplied DFAs.
+        for index in 0..self.state_count {
+            let mut state = self.get_state_mut(S::from_usize(index));
+            for (_, target) in state.iter_mut() {
+                *target = S::from_usize(target.to_usize() * stride);
+            }
+        }
+        self.start = S::from_usize(self.start.to_usize() * stride);
+        self.max_match = S::from_usize(self.max_match.to_usize() * stride);
+        self.premultiplied = true;
+        Ok(())
+    }
+
+    /// Minimize this DFA in place by running a `Minimizer` over it
+    /// (Hopcroft's algorithm).
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied.
+    pub fn minimize(&mut self) {
+        if self.premultiplied {
+            panic!("can't minimize premultiplied DFA");
+        }
+        Minimizer::new(self).run();
+    }
+
+    /// Make `start` the start state of this DFA.
+    ///
+    /// DFAs should be completely constructed first and premultiplied
+    /// afterwards, which is why this is rejected on a premultiplied DFA.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied or if `start` does not refer to
+    /// an existing state.
+    pub fn set_start_state(&mut self, start: S) {
+        assert!(!self.premultiplied, "can't set start on premultiplied DFA");
+        let index = start.to_usize();
+        assert!(index < self.state_count, "invalid start state");
+
+        self.start = start;
+    }
+
+    /// Record `id` as the largest state identifier that could possibly
+    /// correspond to a match state.
+    ///
+    /// This routine does not verify the following invariant; callers must
+    /// uphold it: every state identifier less than or equal to `id` is
+    /// either a match state or the special dead state (which always has
+    /// identifier 0 and whose transitions all lead back to itself).
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied or if `id` does not refer to an
+    /// existing state.
+    pub fn set_max_match_state(&mut self, id: S) {
+        assert!(!self.premultiplied, "can't set match on premultiplied DFA");
+        let index = id.to_usize();
+        assert!(index < self.state_count, "invalid max match state");
+
+        self.max_match = id;
+    }
+
+    /// Record that state `from` moves to state `to` on input `byte`.
+    ///
+    /// The byte is first mapped to its equivalence class, and the entry for
+    /// that class in the row of `from` is overwritten with `to`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied or if either `from` or `to` does
+    /// not refer to an existing state.
+    pub fn add_transition(&mut self, from: S, byte: u8, to: S) {
+        assert!(!self.premultiplied, "can't add trans to premultiplied DFA");
+        let (source, target) = (from.to_usize(), to.to_usize());
+        assert!(source < self.state_count, "invalid from state");
+        assert!(target < self.state_count, "invalid to state");
+
+        let column = self.byte_classes().get(byte) as usize;
+        let row = source * self.alphabet_len();
+        self.trans[row + column] = to;
+    }
+
+    /// Add an empty state (a state where all transitions lead to the dead
+    /// state) and return its identifier. The new state is appended to the
+    /// end of the transition table, so its identifier does not point to
+    /// any other existing state.
+    ///
+    /// If adding a state would exhaust the state identifier space (given
+    /// by `S`), then the error from `next_state_id` is returned. In
+    /// practice, this means that the state identifier representation
+    /// chosen is too small.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied.
+    pub fn add_empty_state(&mut self) -> Result<S> {
+        assert!(!self.premultiplied, "can't add state to premultiplied DFA");
+
+        let id = match self.state_count {
+            0 => S::from_usize(0),
+            count => next_state_id(S::from_usize(count - 1))?,
+        };
+        // One dead transition per element of the alphabet.
+        let row = iter::repeat(dead_id::<S>()).take(self.alphabet_len());
+        self.trans.extend(row);
+        // This should never panic, since state_count is a usize. The
+        // transition table size would have run out of room long ago.
+        self.state_count = self.state_count.checked_add(1).unwrap();
+        Ok(id)
+    }
+
+    /// Borrow the row of the transition table that belongs to state `id`
+    /// as a mutable `StateMut`. This is useful for routines that rewrite
+    /// DFA states (e.g., when remapping state identifiers).
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied, or if the row for `id` lies
+    /// outside the transition table.
+    pub fn get_state_mut(&mut self, id: S) -> StateMut<S> {
+        assert!(!self.premultiplied, "can't get state in premultiplied DFA");
+
+        let stride = self.alphabet_len();
+        let begin = id.to_usize() * stride;
+        let end = begin + stride;
+        StateMut { transitions: &mut self.trans[begin..end] }
+    }
+
+    /// Exchange the rows of states `id1` and `id2` in the transition table.
+    ///
+    /// Only the two rows themselves are exchanged. Nothing is done to check
+    /// the correctness of the swap: callers must update every transition
+    /// that points to `id1` or `id2` themselves.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied.
+    pub fn swap_states(&mut self, id1: S, id2: S) {
+        assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
+
+        let stride = self.alphabet_len();
+        let row1 = id1.to_usize() * stride;
+        let row2 = id2.to_usize() * stride;
+        for column in 0..stride {
+            self.trans.swap(row1 + column, row2 + column);
+        }
+    }
+
+    /// Shrink this DFA to its first `count` states by cutting the
+    /// transition table down to `count` rows.
+    ///
+    /// Nothing is done to check the correctness of the truncation: callers
+    /// must make sure no remaining transition still points to a removed
+    /// state.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied.
+    pub fn truncate_states(&mut self, count: usize) {
+        assert!(!self.premultiplied, "can't truncate in premultiplied DFA");
+
+        let table_len = count * self.alphabet_len();
+        self.trans.truncate(table_len);
+        self.state_count = count;
+    }
+
+    /// This routine shuffles all match states in this DFA---according to the
+    /// given map---to the beginning of the DFA such that every non-match state
+    /// appears after every match state. (With one exception: the special dead
+    /// state remains as the first state.) The given map should have length
+    /// exactly equivalent to the number of states in this DFA.
+    ///
+    /// The purpose of doing this shuffling is to avoid the need to store
+    /// additional state to determine whether a state is a match state or not.
+    /// It also enables a single conditional in the core matching loop instead
+    /// of two.
+    ///
+    /// This updates `self.max_match` to point to the last matching state as
+    /// well as `self.start` if the starting state was moved.
+    ///
+    /// `is_match[i]` says whether the state with index `i` is a match state.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this DFA is premultiplied or if `is_match.len()` differs
+    /// from the number of states.
+    pub fn shuffle_match_states(&mut self, is_match: &[bool]) {
+        assert!(
+            !self.premultiplied,
+            "cannot shuffle match states of premultiplied DFA"
+        );
+        assert_eq!(self.state_count, is_match.len());
+
+        // With zero or one state there is nothing to reorder.
+        if self.state_count <= 1 {
+            return;
+        }
+
+        // Find the lowest index (skipping index 0, the dead state) that
+        // holds a non-match state. This is the first slot a match state can
+        // be moved into.
+        let mut first_non_match = 1;
+        while first_non_match < self.state_count && is_match[first_non_match] {
+            first_non_match += 1;
+        }
+
+        // `swaps[i]` is the new identifier of the state that was at index
+        // `i`, or the dead state identifier if that state did not move.
+        // Index 0 never takes part in a swap, which is what lets the dead
+        // identifier double as the "not moved" marker.
+        let mut swaps: Vec<S> = vec![dead_id(); self.state_count];
+        // Walk backwards from the last state, moving each match state found
+        // into the lowest slot currently holding a non-match state.
+        let mut cur = self.state_count - 1;
+        while cur > first_non_match {
+            if is_match[cur] {
+                self.swap_states(
+                    S::from_usize(cur),
+                    S::from_usize(first_non_match),
+                );
+                swaps[cur] = S::from_usize(first_non_match);
+                swaps[first_non_match] = S::from_usize(cur);
+
+                // Advance to the next slot holding a non-match state. Note
+                // that `is_match` is indexed by original position; slots
+                // below `cur` have not been swapped with anything yet.
+                first_non_match += 1;
+                while first_non_match < cur && is_match[first_non_match] {
+                    first_non_match += 1;
+                }
+            }
+            cur -= 1;
+        }
+        // Rewrite every transition whose target state was moved.
+        for id in (0..self.state_count).map(S::from_usize) {
+            for (_, next) in self.get_state_mut(id).iter_mut() {
+                if swaps[next.to_usize()] != dead_id() {
+                    *next = swaps[next.to_usize()];
+                }
+            }
+        }
+        // The start state may itself have been moved.
+        if swaps[self.start.to_usize()] != dead_id() {
+            self.start = swaps[self.start.to_usize()];
+        }
+        // Everything below `first_non_match` is now the dead state or a
+        // match state.
+        self.max_match = S::from_usize(first_non_match - 1);
+    }
+}
+
+#[cfg(feature = "std")]
+impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> {
+    /// Write one line per state, wrapped in `DenseDFA(` and `)`. Each line
+    /// holds a two character status marker, the zero-padded state
+    /// identifier and the debug rendering of the state itself.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        /// Pick the status marker for `id`. The first character is `D` for
+        /// the dead state, `>` for the start state and a space otherwise
+        /// (dead takes precedence over start). The second character is `*`
+        /// for match states and a space otherwise.
+        fn state_status<T: AsRef<[S]>, S: StateID>(
+            dfa: &Repr<T, S>,
+            id: S,
+        ) -> &'static str {
+            let is_dead = id == dead_id();
+            let is_start = id == dfa.start_state();
+            match (is_dead, is_start, dfa.is_match_state(id)) {
+                (true, _, true) => "D*",
+                (true, _, false) => "D ",
+                (false, true, true) => ">*",
+                (false, true, false) => "> ",
+                (false, false, true) => " *",
+                (false, false, false) => "  ",
+            }
+        }
+
+        writeln!(f, "DenseDFA(")?;
+        for (id, state) in self.states() {
+            let marker = state_status(self, id);
+            writeln!(f, "{}{:06}: {:?}", marker, id.to_usize(), state)?;
+        }
+        writeln!(f, ")")
+    }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a tuple for each state. The first element of the
+/// tuple corresponds to a state's identifier, and the second element
+/// corresponds to the state itself (comprised of its transitions).
+///
+/// If this DFA is premultiplied, then the state identifiers are in turn
+/// premultiplied as well, making them usable without additional modification.
+///
+/// `'a` corresponds to the lifetime of the original DFA, `T` corresponds to
+/// the type of the transition table itself and `S` corresponds to the state
+/// identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct StateIter<'a, T: 'a, S: 'a> {
+    // The DFA being iterated. It is consulted for its `premultiplied` flag
+    // and alphabet length when turning a chunk index into an identifier.
+    dfa: &'a Repr<T, S>,
+    // Enumerated chunks of the transition table. Presumably each chunk is
+    // the row of one state -- the chunk size is chosen by whoever builds
+    // this iterator; confirm there.
+    it: iter::Enumerate<slice::Chunks<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> {
+    type Item = (S, State<'a, S>);
+
+    /// Yield the next state together with its identifier. The identifier
+    /// is the chunk index, multiplied by the alphabet length when the DFA
+    /// is premultiplied.
+    fn next(&mut self) -> Option<(S, State<'a, S>)> {
+        let (index, chunk) = self.it.next()?;
+        let raw_id = if self.dfa.premultiplied {
+            index * self.dfa.alphabet_len()
+        } else {
+            index
+        };
+        Some((S::from_usize(raw_id), State { transitions: chunk }))
+    }
+}
+
+/// An immutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table and `S`
+/// corresponds to the state identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct State<'a, S: 'a> {
+    // The outgoing transitions of this state, borrowed from the DFA's
+    // transition table. Entry `i` is the next state for input index `i`.
+    transitions: &'a [S],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> State<'a, S> {
+    /// Iterate over every transition of this state, in order. One item is
+    /// produced per entry in this state's row, i.e. as many as the alphabet
+    /// length of the corresponding DFA.
+    ///
+    /// Each item is a tuple: the first element is the input for that
+    /// transition and the second element is the transition itself.
+    pub fn transitions(&self) -> StateTransitionIter<S> {
+        let it = self.transitions.iter().enumerate();
+        StateTransitionIter { it }
+    }
+
+    /// Iterate over a sparse view of the transitions of this state. Only
+    /// non-dead transitions are returned.
+    ///
+    /// The "sparse" view is a sequence of triples. The first two elements
+    /// of a triple form an inclusive input range and the last element is
+    /// the transition taken for every input in that range.
+    ///
+    /// This is somewhat more condensed than the classical sparse
+    /// representation (one element per non-dead transition), but in
+    /// practice checking whether a byte is in a range is very cheap and
+    /// ranges tend to conserve quite a bit more space.
+    pub fn sparse_transitions(&self) -> StateSparseTransitionIter<S> {
+        let dense = self.transitions();
+        StateSparseTransitionIter { dense, cur: None }
+    }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for State<'a, S> {
+    /// Render the sparse transitions of this state as a comma separated
+    /// list. A range covering a single input is written as `x => id`, any
+    /// other range as `x-y => id`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let rendered: Vec<_> = self
+            .sparse_transitions()
+            .map(|(start, end, next_id)| {
+                if start == end {
+                    format!("{} => {}", escape(start), next_id.to_usize())
+                } else {
+                    format!(
+                        "{}-{} => {}",
+                        escape(start),
+                        escape(end),
+                        next_id.to_usize(),
+                    )
+                }
+            })
+            .collect();
+        write!(f, "{}", rendered.join(", "))
+    }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+/// byte for that transition and the second element is the transition itself.
+///
+/// NOTE(review): the first element is the position of the transition within
+/// the state's row. When byte classes are in use this is presumably the
+/// equivalence class rather than a raw byte -- confirm against callers.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIter<'a, S: 'a> {
+    // The state's transitions paired with their position in the row.
+    it: iter::Enumerate<slice::Iter<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateTransitionIter<'a, S> {
+    type Item = (u8, S);
+
+    /// Yield the next transition as `(position as u8, next state)`.
+    fn next(&mut self) -> Option<(u8, S)> {
+        let (position, &target) = self.it.next()?;
+        Some((position as u8, target))
+    }
+}
+
+/// An iterator over all transitions in a single DFA state using a sparse
+/// representation.
+///
+/// Each transition is represented by a triple. The first two elements of the
+/// triple comprise an inclusive byte range while the last element corresponds
+/// to the transition taken for all bytes in the range. Ranges that lead to
+/// the dead state are not yielded.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateSparseTransitionIter<'a, S: 'a> {
+    // The underlying one-item-per-transition iterator.
+    dense: StateTransitionIter<'a, S>,
+    // The range currently being accumulated as `(start, end, next state)`,
+    // or `None` if nothing is pending.
+    cur: Option<(u8, u8, S)>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> {
+    type Item = (u8, u8, S);
+
+    /// Yield the next maximal run of consecutive inputs that share the
+    /// same non-dead target, as `(first, last, target)`.
+    fn next(&mut self) -> Option<(u8, u8, S)> {
+        for (input, target) in self.dense.by_ref() {
+            let pending = self.cur;
+            match pending {
+                // Nothing pending yet: open a run at this input.
+                None => {
+                    self.cur = Some((input, input, target));
+                }
+                // Same target as the pending run: extend it.
+                Some((first, _, prev)) if prev == target => {
+                    self.cur = Some((first, input, prev));
+                }
+                // The target changed: open a new run and hand out the
+                // finished one, unless it leads to the dead state.
+                Some((first, last, prev)) => {
+                    self.cur = Some((input, input, target));
+                    if prev != dead_id() {
+                        return Some((first, last, prev));
+                    }
+                }
+            }
+        }
+        // The dense iterator is exhausted: flush whatever run is pending.
+        match self.cur.take() {
+            Some((first, last, target)) if target != dead_id() => {
+                Some((first, last, target))
+            }
+            _ => None,
+        }
+    }
+}
+
+/// A mutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table and `S`
+/// corresponds to the state identifier representation.
+#[cfg(feature = "std")]
+pub(crate) struct StateMut<'a, S: 'a> {
+    // The outgoing transitions of this state, mutably borrowed from the
+    // DFA's transition table.
+    transitions: &'a mut [S],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> StateMut<'a, S> {
+    /// Iterate mutably over every transition of this state, in order. One
+    /// item is produced per entry in this state's row, i.e. as many as the
+    /// alphabet length of the corresponding DFA.
+    ///
+    /// Each item is a tuple: the first element is the input for that
+    /// transition and the second element is a mutable reference to the
+    /// transition itself.
+    pub fn iter_mut(&mut self) -> StateTransitionIterMut<S> {
+        let it = self.transitions.iter_mut().enumerate();
+        StateTransitionIterMut { it }
+    }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
+    /// Format exactly like an immutable `State` over the same transitions.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let view = State { transitions: &*self.transitions };
+        fmt::Debug::fmt(&view, f)
+    }
+}
+
+/// A mutable iterator over all transitions in a DFA state.
+///
+/// Each transition is represented by a tuple. The first element is the
+/// input byte for that transition and the second element is a mutable
+/// reference to the transition itself.
+///
+/// NOTE(review): the first element is the position of the transition within
+/// the state's row. When byte classes are in use this is presumably the
+/// equivalence class rather than a raw byte -- confirm against callers.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIterMut<'a, S: 'a> {
+    // The state's transitions, mutably borrowed and paired with their
+    // position in the row.
+    it: iter::Enumerate<slice::IterMut<'a, S>>,
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> {
+    type Item = (u8, &'a mut S);
+
+    /// Yield the next transition as `(position as u8, &mut next state)`.
+    fn next(&mut self) -> Option<(u8, &'a mut S)> {
+        match self.it.next() {
+            Some((position, target)) => Some((position as u8, target)),
+            None => None,
+        }
+    }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder permits configuring several aspects of the construction
+/// process such as case insensitivity, Unicode support and various options
+/// that impact the size of the generated DFA. In some cases, options (like
+/// performing DFA minimization) can come with a substantial additional cost.
+///
+/// This builder always constructs a *single* DFA. As such, this builder can
+/// only be used to construct regexes that either detect the presence of a
+/// match or find the end location of a match. A single DFA cannot produce both
+/// the start and end of a match. For that information, use a
+/// [`Regex`](struct.Regex.html), which can be similarly configured using
+/// [`RegexBuilder`](struct.RegexBuilder.html).
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+    // Configuration for the regex parser that turns a pattern into an HIR.
+    parser: ParserBuilder,
+    // Configuration for the NFA compiler that runs on the parsed HIR.
+    nfa: nfa::Builder,
+    // Whether matches must start at the beginning of the input. Defaults
+    // to false.
+    anchored: bool,
+    // Whether the determinized DFA is minimized before being returned.
+    // Defaults to false.
+    minimize: bool,
+    // Whether state identifiers are premultiplied before the DFA is
+    // returned. Defaults to true.
+    premultiply: bool,
+    // Whether determinization uses byte equivalence classes. Defaults to
+    // true.
+    byte_classes: bool,
+    // Defaults to false. NOTE(review): presumably requests a DFA for the
+    // reversed pattern; it is not consulted by the build routines in this
+    // part of the file -- confirm where it is used.
+    reverse: bool,
+    // Passed through to the determinizer; only accepted together with
+    // `anchored`. Defaults to false.
+    longest_match: bool,
+}
+
+#[cfg(feature = "std")]
+impl Builder {
+    /// Create a new DenseDFA builder with the default configuration:
+    /// unanchored, not minimized, premultiplied, with byte classes, not
+    /// reversed and without longest match semantics.
+    pub fn new() -> Builder {
+        // Shrinking is enabled by default, but it is set explicitly anyway:
+        // since a DFA is being built, shrinking the NFA is always a good
+        // idea.
+        let mut nfa = nfa::Builder::new();
+        nfa.shrink(true);
+        let parser = ParserBuilder::new();
+        Builder {
+            parser,
+            nfa,
+            anchored: false,
+            reverse: false,
+            longest_match: false,
+            minimize: false,
+            premultiply: true,
+            byte_classes: true,
+        }
+    }
+
+    /// Build a DFA from the given pattern, using `usize` for its state
+    /// identifiers.
+    ///
+    /// An error is returned if there was a problem parsing or compiling
+    /// the pattern.
+    pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
+        // The state ID type is pinned to `usize` by the return type.
+        self.build_with_size(pattern)
+    }
+
+    /// Build a DFA from the given pattern using a specific representation
+    /// for the DFA's state IDs.
+    ///
+    /// An error is returned if there was a problem parsing or compiling
+    /// the pattern.
+    ///
+    /// The state ID representation is chosen by the `S` type parameter,
+    /// which is usually one of `u8`, `u16`, `u32`, `u64` or `usize` (the
+    /// one `build` uses). A smaller representation reduces the memory
+    /// footprint of the DFA.
+    ///
+    /// The chosen representation is used throughout determinization and
+    /// minimization (if minimization was requested). So even if the
+    /// minimized DFA would fit into `S`, this still returns an error when
+    /// the initial determinized DFA does not. To get a minimized DFA with
+    /// a smaller state ID representation, first build it with a bigger
+    /// one and then shrink it using one of the DFA's conversion routines,
+    /// such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
+    pub fn build_with_size<S: StateID>(
+        &self,
+        pattern: &str,
+    ) -> Result<DenseDFA<Vec<S>, S>> {
+        let nfa = self.build_nfa(pattern)?;
+        self.build_from_nfa(&nfa)
+    }
+
+    /// An internal only (for now) API for building a dense DFA directly
+    /// from an NFA.
+    ///
+    /// The NFA is determinized (with or without byte classes), then
+    /// optionally minimized and optionally premultiplied, according to
+    /// this builder's configuration. Longest match semantics are only
+    /// supported for anchored DFAs; requesting them otherwise is an error.
+    pub(crate) fn build_from_nfa<S: StateID>(
+        &self,
+        nfa: &NFA,
+    ) -> Result<DenseDFA<Vec<S>, S>> {
+        if !self.anchored && self.longest_match {
+            return Err(Error::unsupported_longest_match());
+        }
+
+        let determinized = if self.byte_classes {
+            Determinizer::new(nfa)
+                .with_byte_classes()
+                .longest_match(self.longest_match)
+                .build()
+        } else {
+            Determinizer::new(nfa).longest_match(self.longest_match).build()
+        };
+        let mut dfa = determinized?;
+        // Minimization must come first: it is rejected on a premultiplied
+        // DFA.
+        if self.minimize {
+            dfa.minimize();
+        }
+        if self.premultiply {
+            dfa.premultiply()?;
+        }
+        Ok(dfa.into_dense_dfa())
+    }
+
+    /// Parse the given pattern with the configured parser and compile the
+    /// result into an NFA with the configured NFA builder.
+    ///
+    /// Parse failures are reported as syntax errors; NFA compilation
+    /// errors are converted into this crate's error type.
+    pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> {
+        let parsed = self.parser.build().parse(pattern);
+        let hir = parsed.map_err(Error::syntax)?;
+        let nfa = self.nfa.build(&hir)?;
+        Ok(nfa)
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of the input. When
+    /// disabled, the DFA will act as if the pattern started with a `.*?`,
+    /// which enables a match to appear anywhere.
+    ///
+    /// By default this is disabled.
+    ///
+    /// Returns this builder so that calls can be chained.
+    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+        // Kept locally as well: it is checked when longest match semantics
+        // are requested at build time.
+        self.anchored = yes;
+        self.nfa.anchored(yes);
+        self
+    }
+
+    /// Enable or disable the case insensitive flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `i` flag.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
+        self.parser.case_insensitive(yes);
+        self
+    }
+
+    /// Enable verbose mode in the regular expression.
+    ///
+    /// When enabled, verbose mode permits insignificant whitespace in many
+    /// places in the regular expression, as well as comments. Comments are
+    /// started using `#` and continue until the end of the line.
+    ///
+    /// By default, this is disabled. It may be selectively enabled in the
+    /// regular expression by using the `x` flag regardless of this setting.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
+        self.parser.ignore_whitespace(yes);
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `s` flag.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
+        self.parser.dot_matches_new_line(yes);
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `U` flag.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn swap_greed(&mut self, yes: bool) -> &mut Builder {
+        self.parser.swap_greed(yes);
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    ///
+    /// By default this is **enabled**. It may alternatively be selectively
+    /// disabled in the regular expression itself via the `u` flag.
+    ///
+    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
+    /// default), a regular expression will fail to parse if Unicode mode is
+    /// disabled and a sub-expression could possibly match invalid UTF-8.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn unicode(&mut self, yes: bool) -> &mut Builder {
+        self.parser.unicode(yes);
+        self
+    }
+
+    /// When enabled, the builder will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// When disabled (the default), the builder is guaranteed to produce a
+    /// regex that will only ever match valid UTF-8 (otherwise, the builder
+    /// will return an error).
+    ///
+    /// Returns this builder so that calls can be chained.
+    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
+        // Both the parser and the NFA builder receive the setting.
+        self.parser.allow_invalid_utf8(yes);
+        self.nfa.allow_invalid_utf8(yes);
+        self
+    }
+
+    /// Set the nesting limit used for the regular expression parser.
+    ///
+    /// The nesting limit controls how deep the abstract syntax tree is allowed
+    /// to be. If the AST exceeds the given limit (e.g., with too many nested
+    /// groups), then an error is returned by the parser.
+    ///
+    /// The purpose of this limit is to act as a heuristic to prevent stack
+    /// overflow when building a finite automaton from a regular expression's
+    /// abstract syntax tree. In particular, construction currently uses
+    /// recursion. In the future, the implementation may stop using recursion
+    /// and this option will no longer be necessary.
+    ///
+    /// This limit is not checked until the entire AST is parsed. Therefore,
+    /// if callers want to put a limit on the amount of heap space used, then
+    /// they should impose a limit on the length, in bytes, of the concrete
+    /// pattern string. In particular, this is viable since the parser will
+    /// limit itself to heap space proportional to the length of the pattern
+    /// string.
+    ///
+    /// Note that a nest limit of `0` will return a nest limit error for most
+    /// patterns but not all. For example, a nest limit of `0` permits `a` but
+    /// not `ab`, since `ab` requires a concatenation AST item, which results
+    /// in a nest depth of `1`. In general, a nest limit is not something that
+    /// manifests in an obvious way in the concrete syntax, therefore, it
+    /// should not be used in a granular way.
+    ///
+    /// The setting is forwarded to the underlying parser builder. Returns
+    /// this builder so that calls can be chained.
+    pub fn nest_limit(&mut self, limit: u32) -> &mut Builder {
+        self.parser.nest_limit(limit);
+        self
+    }
+
+    /// Minimize the DFA.
+    ///
+    /// When enabled, the DFA built will be minimized such that it is as small
+    /// as possible.
+    ///
+    /// Whether one enables minimization or not depends on the types of costs
+    /// you're willing to pay and how much you care about its benefits. In
+    /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+    /// space, where `n` is the number of DFA states and `k` is the alphabet
+    /// size. In practice, minimization can be quite costly in terms of both
+    /// space and time, so it should only be done if you're willing to wait
+    /// longer to produce a DFA. In general, you might want a minimal DFA in
+    /// the following circumstances:
+    ///
+    /// 1. You would like to optimize for the size of the automaton. This can
+    ///    manifest in one of two ways. Firstly, if you're converting the
+    ///    DFA into Rust code (or a table embedded in the code), then a minimal
+    ///    DFA will translate into a corresponding reduction in code size, and
+    ///    thus, also the final compiled binary size. Secondly, if you are
+    ///    building many DFAs and putting them on the heap, you'll be able to
+    ///    fit more if they are smaller. Note though that building a minimal
+    ///    DFA itself requires additional space; you only realize the space
+    ///    savings once the minimal DFA is constructed (at which point, the
+    ///    space used for minimization is freed).
+    /// 2. You've observed that a smaller DFA results in faster match
+    ///    performance. Naively, this isn't guaranteed since there is no
+    ///    inherent difference between matching with a bigger-than-minimal
+    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
+    ///    CPU's cache more efficiently.
+    /// 3. You are trying to establish an equivalence between regular
+    ///    languages. The standard method for this is to build a minimal DFA
+    ///    for each language and then compare them. If the DFAs are equivalent
+    ///    (up to state renaming), then the languages are equivalent.
+    ///
+    /// This option is disabled by default.
+    ///
+    /// Returns this builder so that calls can be chained.
+    pub fn minimize(&mut self, yes: bool) -> &mut Builder {
+        // Only recorded here; the minimization itself happens at build time.
+        self.minimize = yes;
+        self
+    }
+
+    /// Premultiply state identifiers in the DFA's transition table.
+    ///
+    /// When enabled, state identifiers are premultiplied to point to their
+    /// corresponding row in the DFA's transition table. That is, given the
+    /// `i`th state, its corresponding premultiplied identifier is `i * k`
+    /// where `k` is the alphabet size of the DFA. (The alphabet size is at
+    /// most 256, but is in practice smaller if byte classes are enabled.)
+    ///
+    /// When state identifiers are not premultiplied, then the identifier of
+    /// the `i`th state is `i`.
+    ///
+    /// The advantage of premultiplying state identifiers is that it saves
+    /// a multiplication instruction per byte when searching with the DFA.
+    /// This has been observed to lead to a 20% performance benefit in
+    /// micro-benchmarks.
+    ///
+    /// The primary disadvantage of premultiplying state identifiers is
+    /// that they require a larger integer size to represent. For example,
+    /// if your DFA has 200 states, then its premultiplied form requires
+    /// 16 bits to represent every possible state identifier, whereas its
+    /// non-premultiplied form only requires 8 bits.
+    ///
+    /// This option is enabled by default.
+    pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
+        self.premultiply = yes;
+        self
+    }
+
+    /// Shrink the size of the DFA's alphabet by mapping bytes to their
+    /// equivalence classes.
+    ///
+    /// When enabled, each DFA will use a map from all possible bytes to their
+    /// corresponding equivalence class. Each equivalence class represents a
+    /// set of bytes that does not discriminate between a match and a non-match
+    /// in the DFA. For example, the pattern `[ab]+` has at least two
+    /// equivalence classes: a set containing `a` and `b` and a set containing
+    /// every byte except for `a` and `b`. `a` and `b` are in the same
+    /// equivalence class because they never discriminate between a match
+    /// and a non-match.
+    ///
+    /// The advantage of this map is that the size of the transition table can
+    /// be reduced drastically from `#states * 256 * sizeof(id)` to
+    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+    /// classes. As a result, total space usage can decrease substantially.
+    /// Moreover, since a smaller alphabet is used, compilation becomes faster
+    /// as well.
+    ///
+    /// The disadvantage of this map is that every byte searched must be
+    /// passed through this map before it can be used to determine the next
+    /// transition. This has a small match time performance cost.
+    ///
+    /// This option is enabled by default.
+    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
+        self.byte_classes = yes;
+        self
+    }
+
+    /// Reverse the DFA.
+    ///
+    /// A DFA reversal is performed by reversing all of the concatenated
+    /// sub-expressions in the original pattern, recursively. The resulting
+    /// DFA can be used to match the pattern starting from the end of a string
+    /// instead of the beginning of a string.
+    ///
+    /// Generally speaking, a reversed DFA is most useful for finding the start
+    /// of a match, since a single forward DFA is only capable of finding the
+    /// end of a match. This start-of-match handling is done for you
+    /// automatically if you build a [`Regex`](struct.Regex.html).
+    pub fn reverse(&mut self, yes: bool) -> &mut Builder {
+        self.reverse = yes;
+        self.nfa.reverse(yes);
+        self
+    }
+
+    /// Find the longest possible match.
+    ///
+    /// This is distinct from the default leftmost-first match semantics in
+    /// that it treats all NFA states as having equivalent priority. In other
+    /// words, the longest possible match is always found and it is not
+    /// possible to implement non-greedy match semantics when this is set. That
+    /// is, `a+` and `a+?` are equivalent when this is enabled.
+    ///
+    /// In particular, a practical issue with this option at the moment is that
+    /// it prevents unanchored searches from working correctly, since
+    /// unanchored searches are implemented by prepending a non-greedy `.*?`
+    /// to the beginning of the pattern. As stated above, non-greedy match
+    /// semantics aren't supported. Therefore, if this option is enabled and
+    /// an unanchored search is requested, then building a DFA will return an
+    /// error.
+    ///
+    /// This option is principally useful when building a reverse DFA for
+    /// finding the start of a match. If you are building a regex with
+    /// [`RegexBuilder`](struct.RegexBuilder.html), then this is handled for
+    /// you automatically. The reason why this is necessary for start of match
+    /// handling is because we want to find the earliest possible starting
+    /// position of a match to satisfy leftmost-first match semantics. When
+    /// matching in reverse, this means finding the longest possible match,
+    /// hence, this option.
+    ///
+    /// By default this is disabled.
+    pub fn longest_match(&mut self, yes: bool) -> &mut Builder {
+        // Possible future work: prior art in RE2 shows how this can support
+        // unanchored searches. Instead of treating all NFA states as having
+        // equivalent priority, we instead group NFA states into sets, and
+        // treat members of each set as having equivalent priority, but having
+        // greater priority than all following members of different sets. We
+        // then essentially assign a higher priority to everything over the
+        // prefix `.*?`.
+        self.longest_match = yes;
+        self
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// This may be exposed in the future, but for now it is only exported for
+    /// use in the `regex-automata-debug` tool (hence `#[doc(hidden)]`).
+    #[doc(hidden)]
+    pub fn shrink(&mut self, yes: bool) -> &mut Builder {
+        self.nfa.shrink(yes);
+        self
+    }
+}
+
+#[cfg(feature = "std")]
+impl Default for Builder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Render a byte in its `std::ascii::escape_default` form, as a string.
+#[cfg(feature = "std")]
+fn escape(b: u8) -> String {
+    use std::ascii::escape_default;
+
+    escape_default(b).map(char::from).collect()
+}
+
+#[cfg(all(test, feature = "std"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn errors_when_converting_to_smaller_dfa() {
+        let pattern = r"\w{10}";
+
+        let mut builder = Builder::new();
+        builder.byte_classes(false).anchored(true).premultiply(false);
+        // building with u16 state ids works ...
+        let wide = builder.build_with_size::<u16>(pattern).unwrap();
+        // ... but the result cannot be narrowed to u8 state ids
+        assert!(wide.to_u8().is_err());
+    }
+
+    #[test]
+    fn errors_when_determinization_would_overflow() {
+        let pattern = r"\w{10}";
+
+        let mut builder = Builder::new();
+        builder.byte_classes(false).anchored(true).premultiply(false);
+        // using u16 is fine
+        assert!(builder.build_with_size::<u16>(pattern).is_ok());
+        // ... but u8 results in overflow (because there are >256 states)
+        assert!(builder.build_with_size::<u8>(pattern).is_err());
+    }
+
+    #[test]
+    fn errors_when_premultiply_would_overflow() {
+        let pattern = r"[a-z]";
+
+        let mut builder = Builder::new();
+        builder.byte_classes(false).anchored(true).premultiply(false);
+        // without premultiplication, the state ids fit in a u8
+        assert!(builder.build_with_size::<u8>(pattern).is_ok());
+        // ... but premultiplied ids (`i * alphabet size`) overflow a u8
+        builder.premultiply(true);
+        assert!(builder.build_with_size::<u8>(pattern).is_err());
+    }
+
+    // let data = ::std::fs::read_to_string("/usr/share/dict/words").unwrap();
+    // let mut words: Vec<&str> = data.lines().collect();
+    // println!("{} words", words.len());
+    // words.sort_by(|w1, w2| w1.len().cmp(&w2.len()).reverse());
+    // let pattern = words.join("|");
+    // print_automata_counts(&pattern);
+    // print_automata(&pattern);
+
+    // print_automata(r"[01]*1[01]{5}");
+    // print_automata(r"X(.?){0,8}Y");
+    // print_automata_counts(r"\p{alphabetic}");
+    // print_automata(r"a*b+|cdefg");
+    // print_automata(r"(..)*(...)*");
+
+    // let pattern = r"\p{any}*?\p{Other_Uppercase}";
+    // let pattern = r"\p{any}*?\w+";
+    // print_automata_counts(pattern);
+    // print_automata_counts(r"(?-u:\w)");
+
+    // let pattern = r"\p{Greek}";
+    // let pattern = r"zZzZzZzZzZ";
+    // let pattern = grapheme_pattern();
+    // let pattern = r"\p{Ideographic}";
+    // let pattern = r"\w{10}"; // 51784 --> 41264
+    // let pattern = r"\w"; // 5182
+    // let pattern = r"a*";
+    // print_automata(pattern);
+    // let (_, _, dfa) = build_automata(pattern);
+}
diff --git a/src/determinize.rs b/src/determinize.rs
new file mode 100644
index 0000000..f300316
--- /dev/null
+++ b/src/determinize.rs
@@ -0,0 +1,285 @@
+use std::collections::HashMap;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use error::Result;
+use nfa::{self, NFA};
+use sparse_set::SparseSet;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// A determinizer converts an NFA to a DFA.
+///
+/// This determinizer follows the typical powerset construction, where each
+/// DFA state is composed of one or more NFA states. In the worst case, there
+/// is one DFA state for every possible combination of NFA states. In practice,
+/// this only happens under certain conditions, typically when there are
+/// bounded repetitions.
+///
+/// The type variable `S` refers to the chosen state identifier representation
+/// used for the DFA.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA being
+/// converted to a DFA.
+#[derive(Debug)]
+pub(crate) struct Determinizer<'a, S: StateID> {
+    /// The NFA we're converting into a DFA.
+    nfa: &'a NFA,
+    /// The DFA we're building.
+    dfa: DFARepr<S>,
+    /// Each DFA state being built is defined as an *ordered* set of NFA
+    /// states, along with a flag indicating whether the state is a match
+    /// state or not.
+    ///
+    /// This is never empty. The first state is always a dummy state such that
+    /// a state id == 0 corresponds to a dead state.
+    builder_states: Vec<Rc<State>>,
+    /// Maps every state built so far (shared with `builder_states` through
+    /// its `Rc`) to its DFA state identifier, keyed by ordered NFA state set.
+    cache: HashMap<Rc<State>, S>,
+    /// Scratch space for a stack of NFA states to visit, for depth first
+    /// visiting without recursion.
+    stack: Vec<nfa::StateID>,
+    /// Scratch space for storing an ordered sequence of NFA states, for
+    /// amortizing allocation.
+    scratch_nfa_states: Vec<nfa::StateID>,
+    /// Whether to build a DFA that finds the longest possible match.
+    longest_match: bool,
+}
+
+/// A DFA state under construction. Also used as the key of the state cache.
+#[derive(Debug, Eq, Hash, PartialEq)]
+struct State {
+    /// Set when a `Match` NFA state was seen while building this state.
+    is_match: bool,
+    /// The `Range` and `Sparse` NFA states of this DFA state, in set order.
+    nfa_states: Vec<nfa::StateID>,
+}
+
+impl<'a, S: StateID> Determinizer<'a, S> {
+    /// Set up a determinizer that will convert `nfa` into a DFA.
+    pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
+        // Seed both the state list and the cache with the dead state.
+        let dead_state = Rc::new(State::dead());
+        let mut seen = HashMap::new();
+        seen.insert(Rc::clone(&dead_state), dead_id());
+        Determinizer {
+            nfa,
+            dfa: DFARepr::empty().anchored(nfa.is_anchored()),
+            builder_states: vec![dead_state],
+            cache: seen,
+            stack: Vec::new(),
+            scratch_nfa_states: Vec::new(),
+            longest_match: false,
+        }
+    }
+
+    /// Switch the transition alphabet from raw byte values to the NFA's byte
+    /// equivalence classes. This replaces the DFA under construction.
+    pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
+        let classes = self.nfa.byte_classes().clone();
+        let fresh = DFARepr::empty_with_byte_classes(classes);
+        self.dfa = fresh.anchored(self.nfa.is_anchored());
+        self
+    }
+
+    /// Instruct the determinizer to build a DFA that recognizes the longest
+    /// possible match instead of the leftmost-first match. This is useful when
+    /// constructing reverse DFAs for finding the start of a match.
+    pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
+        self.longest_match = yes;
+        self
+    }
+
+    /// Run the powerset construction and return the finished DFA. An error is
+    /// returned if construction fails, e.g. when the chosen state identifier
+    /// representation is too small.
+    pub fn build(mut self) -> Result<DFARepr<S>> {
+        let reps: Vec<u8> =
+            self.dfa.byte_classes().representatives().collect();
+        let mut scratch = self.new_sparse_set();
+        let start_id = self.add_start(&mut scratch)?;
+        // Worklist of DFA states whose outgoing transitions are not set yet.
+        let mut pending = vec![start_id];
+        while let Some(from) = pending.pop() {
+            for &byte in reps.iter() {
+                let (to, fresh) = self.cached_state(from, byte, &mut scratch)?;
+                self.dfa.add_transition(from, byte, to);
+                if fresh {
+                    pending.push(to);
+                }
+            }
+        }
+
+        // Move the match states to the front of the final DFA so that a match
+        // loop can detect a match from the state identifier alone, without
+        // any auxiliary storage.
+        let flags: Vec<bool> =
+            self.builder_states.iter().map(|st| st.is_match).collect();
+        self.dfa.shuffle_match_states(&flags);
+        Ok(self.dfa)
+    }
+
+    /// Return the identifier for the next DFA state given an existing DFA
+    /// state and an input byte. If the next DFA state already exists, then
+    /// return its identifier from the cache. Otherwise, build the state, cache
+    /// it and return its identifier.
+    ///
+    /// The given sparse set is used for scratch space. It must have a capacity
+    /// equivalent to the total number of NFA states, but its contents are
+    /// otherwise unspecified (it is cleared before use).
+    ///
+    /// This routine returns a boolean indicating whether a new state was
+    /// built. If a new state is built, then the caller needs to add it to its
+    /// frontier of uncompiled DFA states to compute transitions for.
+    fn cached_state(
+        &mut self,
+        dfa_id: S,
+        b: u8,
+        sparse: &mut SparseSet,
+    ) -> Result<(S, bool)> {
+        sparse.clear();
+        // Compute the set of all reachable NFA states, including epsilons.
+        self.next(dfa_id, b, sparse);
+        // Build a candidate state and check if it has already been built.
+        let state = self.new_state(sparse);
+        if let Some(&cached_id) = self.cache.get(&state) {
+            // The state already exists, so hand the candidate's allocation
+            // back to our scratch space, so that it can be reused.
+            self.scratch_nfa_states = state.nfa_states;
+            return Ok((cached_id, false));
+        }
+        // Nothing was in the cache, so add this state to the cache.
+        self.add_state(state).map(|s| (s, true))
+    }
+
+    /// Compute the set of all reachable NFA states, including the full epsilon
+    /// closure, from a DFA state for a single byte of input.
+    fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
+        next_nfa_states.clear();
+        // Index-based loop: `epsilon_closure` needs `&mut self`, so no borrow
+        // of `builder_states` may be held across the loop body.
+        for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
+            let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
+            match *self.nfa.state(nfa_id) {
+                nfa::State::Range { range: ref r } => {
+                    if r.start <= b && b <= r.end {
+                        self.epsilon_closure(r.next, next_nfa_states);
+                    }
+                }
+                nfa::State::Sparse { ref ranges } => {
+                    // Stop at the first range starting beyond `b`; follow the
+                    // first range before that point which also covers `b`.
+                    let mut live = ranges.iter().take_while(|r| r.start <= b);
+                    if let Some(r) = live.find(|r| b <= r.end) {
+                        self.epsilon_closure(r.next, next_nfa_states);
+                    }
+                }
+                nfa::State::Union { .. }
+                | nfa::State::Fail
+                | nfa::State::Match => {}
+            }
+        }
+    }
+
+    /// Add `start` and every NFA state reachable from it through epsilon
+    /// transitions to `set`, depth first, taking alternates in listed order.
+    fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
+        if !self.nfa.state(start).is_epsilon() {
+            set.insert(start);
+            return;
+        }
+
+        self.stack.push(start);
+        while let Some(popped) = self.stack.pop() {
+            let mut current = popped;
+            while !set.contains(current) {
+                set.insert(current);
+                let alternates = match *self.nfa.state(current) {
+                    nfa::State::Union { ref alternates } => alternates,
+                    nfa::State::Range { .. }
+                    | nfa::State::Sparse { .. }
+                    | nfa::State::Fail
+                    | nfa::State::Match => break,
+                };
+                match alternates.split_first() {
+                    None => break,
+                    Some((&first, rest)) => {
+                        current = first;
+                        self.stack.extend(rest.iter().rev());
+                    }
+                }
+            }
+        }
+    }
+
+    /// Build the start state of the DFA and return its identifier.
+    ///
+    /// `sparse` is scratch space only: it must have capacity for every NFA
+    /// state, and whatever it contained on entry is discarded.
+    fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
+        sparse.clear();
+        self.epsilon_closure(self.nfa.start(), sparse);
+        let start_state = self.new_state(sparse);
+        let start_id = self.add_state(start_state)?;
+        self.dfa.set_start_state(start_id);
+        Ok(start_id)
+    }
+
+    /// Register `state` as a new DFA state and return its identifier.
+    ///
+    /// The new state starts out with no transitions, i.e. every input leads
+    /// to the dead state. It is also recorded in the cache for later lookups.
+    fn add_state(&mut self, state: State) -> Result<S> {
+        let new_id = self.dfa.add_empty_state()?;
+        let shared = Rc::new(state);
+        self.builder_states.push(Rc::clone(&shared));
+        self.cache.insert(shared, new_id);
+        Ok(new_id)
+    }
+
+    /// Build a candidate DFA state from an ordered set of NFA states.
+    ///
+    /// The state's vector is taken from `scratch_nfa_states`, so its
+    /// allocation is reused; only `Range` and `Sparse` states are recorded.
+    fn new_state(&mut self, set: &SparseSet) -> State {
+        let mut nfa_states =
+            mem::replace(&mut self.scratch_nfa_states, Vec::new());
+        nfa_states.clear();
+        let mut is_match = false;
+
+        for &id in set {
+            match *self.nfa.state(id) {
+                nfa::State::Range { .. } | nfa::State::Sparse { .. } => {
+                    nfa_states.push(id);
+                }
+                nfa::State::Union { .. } => {}
+                // A `Fail` state ends the scan of the set.
+                nfa::State::Fail => break,
+                nfa::State::Match => {
+                    is_match = true;
+                    // Leftmost-first: ignore states after the match state.
+                    // Longest match: keep scanning the rest of the set.
+                    if !self.longest_match {
+                        break;
+                    }
+                }
+            }
+        }
+        State { is_match, nfa_states }
+    }
+
+    /// Create a sparse set with room for all `self.nfa.len()` NFA states.
+    fn new_sparse_set(&self) -> SparseSet {
+        SparseSet::new(self.nfa.len())
+    }
+}
+
+impl State {
+    /// The dead state: no NFA states, and never a match.
+    fn dead() -> State {
+        State { is_match: false, nfa_states: Vec::new() }
+    }
+}
diff --git a/src/dfa.rs b/src/dfa.rs
new file mode 100644
index 0000000..43de346
--- /dev/null
+++ b/src/dfa.rs
@@ -0,0 +1,363 @@
+use state_id::StateID;
+
+/// A trait describing the interface of a deterministic finite automaton (DFA).
+///
+/// Every DFA has exactly one start state and at least one dead state (which
+/// may be the same, as in the case of an empty DFA). In all cases, a state
+/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
+/// always returns `true`.
+///
+/// Every DFA also has zero or more match states, such that
+/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
+/// a match state.
+///
+/// In general, users of this trait likely will only need to use the search
+/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
+/// methods are lower level and are used for walking the transitions of a DFA
+/// manually. In particular, the aforementioned search routines are implemented
+/// generically in terms of the lower level transition walking routines.
+pub trait DFA {
+    /// The representation used for state identifiers in this DFA.
+    ///
+    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
+    type ID: StateID;
+
+    /// Return the identifier of this DFA's start state.
+    fn start_state(&self) -> Self::ID;
+
+    /// Returns true if and only if the given identifier corresponds to a match
+    /// state.
+    fn is_match_state(&self, id: Self::ID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a dead
+    /// state. When a DFA enters a dead state, it is impossible to leave and
+    /// thus can never lead to a match.
+    fn is_dead_state(&self, id: Self::ID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to either
+    /// a dead state or a match state, such that one of `is_match_state(id)`
+    /// or `is_dead_state(id)` must return true.
+    ///
+    /// Depending on the implementation of the DFA, this routine can be used
+    /// to save a branch in the core matching loop. Nevertheless,
+    /// `is_match_state(id) || is_dead_state(id)` is always a valid
+    /// implementation.
+    fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
+
+    /// Returns true if and only if this DFA is anchored.
+    ///
+    /// When a DFA is anchored, it is only allowed to report matches that
+    /// start at index `0`.
+    fn is_anchored(&self) -> bool;
+
+    /// Given the current state that this DFA is in and the next input byte,
+    /// this method returns the identifier of the next state. The identifier
+    /// returned is always valid, but it may correspond to a dead state.
+    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
+
+    /// Like `next_state`, but may skip memory safety checks (bounds checks).
+    ///
+    /// # Safety
+    /// Callers must pass a valid state identifier of this DFA; implementors
+    /// must be safe for all valid state identifiers and all `u8` values.
+    unsafe fn next_state_unchecked(
+        &self,
+        current: Self::ID,
+        input: u8,
+    ) -> Self::ID;
+
+    /// Returns true if and only if the given bytes match this DFA.
+    ///
+    /// This routine may short circuit if it knows that scanning further input
+    /// will never lead to a different result. In particular, once the DFA is
+    /// in a match state or a dead state, this routine will return `true` or
+    /// `false`, respectively, without inspecting any further input.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`DenseDFA`](enum.DenseDFA.html).
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
+    /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
+    /// assert_eq!(false, dfa.is_match(b"foobar"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    #[inline]
+    fn is_match(&self, bytes: &[u8]) -> bool {
+        self.is_match_at(bytes, 0)
+    }
+
+    /// Returns the end offset of the first match detected while scanning.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// offset at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`DenseDFA`](enum.DenseDFA.html).
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = DenseDFA::new("foo[0-9]+")?;
+    /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
+    ///
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let dfa = DenseDFA::new("abc|a")?;
+    /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    #[inline]
+    fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
+        self.shortest_match_at(bytes, 0)
+    }
+
+    /// Returns the end offset of the longest match. If no match exists,
+    /// then `None` is returned.
+    ///
+    /// Implementors of this trait are not required to implement any particular
+    /// match semantics (such as leftmost-first), which are instead manifest in
+    /// the DFA's topology itself.
+    ///
+    /// In particular, this method must continue searching even after it
+    /// enters a match state. The search should only terminate once it has
+    /// reached the end of the input or when it has entered a dead state. Upon
+    /// termination, the offset just past the last byte that led into a match
+    /// state is returned.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
+    /// "leftmost first" match semantics.
+    ///
+    /// Leftmost first match semantics corresponds to the match with the
+    /// smallest starting offset, but where the end offset is determined by
+    /// preferring earlier branches in the original regular expression. For
+    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+    /// will match `Samwise` in `Samwise`.
+    ///
+    /// Generally speaking, the "leftmost first" match is how most backtracking
+    /// regular expressions tend to work. This is in contrast to POSIX-style
+    /// regular expressions that yield "leftmost longest" matches. Namely,
+    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+    /// leftmost longest semantics.
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = DenseDFA::new("foo[0-9]+")?;
+    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over later parts.
+    /// let dfa = DenseDFA::new("abc|a")?;
+    /// assert_eq!(Some(3), dfa.find(b"abc"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    #[inline]
+    fn find(&self, bytes: &[u8]) -> Option<usize> {
+        self.find_at(bytes, 0)
+    }
+
+    /// Returns the start offset of the longest match in reverse, by searching
+    /// from the end of the input towards the start of the input. If no match
+    /// exists, then `None` is returned. In other words, this has the same
+    /// match semantics as `find`, but in reverse.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
+    /// is principally useful when used in conjunction with the
+    /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
+    /// configuration option. In general, it's unlikely to be correct to use
+    /// both `find` and `rfind` with the same DFA since any particular DFA will
+    /// only support searching in one direction.
+    ///
+    /// ```
+    /// use regex_automata::{dense, DFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
+    /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    #[inline]
+    fn rfind(&self, bytes: &[u8]) -> Option<usize> {
+        self.rfind_at(bytes, bytes.len())
+    }
+
+    /// Like `is_match`, but starts the search at offset `start`.
+    ///
+    /// The starting point carries the surrounding context: an anchored DFA can
+    /// only match when `start == 0`, so any other `start` yields `false`.
+    /// May panic (when slicing the input) if `start > bytes.len()`.
+    #[inline]
+    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+        if self.is_anchored() && start > 0 {
+            return false;
+        }
+
+        let mut state = self.start_state();
+        if self.is_match_or_dead_state(state) {
+            return self.is_match_state(state);
+        }
+        for &b in bytes[start..].iter() {
+            // SAFETY: `state` only ever holds ids this DFA itself returned.
+            state = unsafe { self.next_state_unchecked(state, b) };
+            if self.is_match_or_dead_state(state) {
+                return self.is_match_state(state);
+            }
+        }
+        false
+    }
+
+    /// Like `shortest_match`, but starts the search at offset `start`.
+    ///
+    /// The starting point carries the surrounding context: an anchored DFA can
+    /// only match when `start == 0`, so any other `start` yields `None`.
+    /// May panic (when slicing the input) if `start > bytes.len()`.
+    #[inline]
+    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        if self.is_anchored() && start > 0 {
+            return None;
+        }
+
+        let mut state = self.start_state();
+        if self.is_match_or_dead_state(state) {
+            return if self.is_dead_state(state) { None } else { Some(start) };
+        }
+        for (i, &b) in bytes[start..].iter().enumerate() {
+            // SAFETY: `state` only ever holds ids this DFA itself returned.
+            state = unsafe { self.next_state_unchecked(state, b) };
+            if self.is_match_or_dead_state(state) {
+                return if self.is_dead_state(state) {
+                    None
+                } else {
+                    Some(start + i + 1)
+                };
+            }
+        }
+        None
+    }
+
+    /// Like `find`, but starts the search at offset `start`.
+    ///
+    /// The starting point carries the surrounding context: an anchored DFA can
+    /// only match when `start == 0`, so any other `start` yields `None`.
+    /// May panic (when slicing the input) if `start > bytes.len()`.
+    #[inline]
+    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        if self.is_anchored() && start > 0 {
+            return None;
+        }
+
+        let mut state = self.start_state();
+        let mut last_match = if self.is_dead_state(state) {
+            return None;
+        } else if self.is_match_state(state) {
+            Some(start)
+        } else {
+            None
+        };
+        for (i, &b) in bytes[start..].iter().enumerate() {
+            // SAFETY: `state` only ever holds ids this DFA itself returned.
+            state = unsafe { self.next_state_unchecked(state, b) };
+            if self.is_match_or_dead_state(state) {
+                if self.is_dead_state(state) {
+                    return last_match;
+                }
+                last_match = Some(start + i + 1);
+            }
+        }
+        last_match
+    }
+
+    /// Like `rfind`, but starts the reverse search at offset `start`.
+    ///
+    /// The starting point carries the surrounding context: an anchored DFA can
+    /// only match when `start == bytes.len()`; a smaller `start` yields `None`.
+    /// May panic (when slicing the input) if `start > bytes.len()`.
+    #[inline(never)]
+    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        if self.is_anchored() && start < bytes.len() {
+            return None;
+        }
+
+        let mut state = self.start_state();
+        let mut last_match = if self.is_dead_state(state) {
+            return None;
+        } else if self.is_match_state(state) {
+            Some(start)
+        } else {
+            None
+        };
+        for (i, &b) in bytes[..start].iter().enumerate().rev() {
+            // SAFETY: `state` only ever holds ids this DFA itself returned.
+            state = unsafe { self.next_state_unchecked(state, b) };
+            if self.is_match_or_dead_state(state) {
+                if self.is_dead_state(state) {
+                    return last_match;
+                }
+                last_match = Some(i);
+            }
+        }
+        last_match
+    }
+}
+
+impl<'a, T: DFA> DFA for &'a T {
+    type ID = T::ID;
+
+    #[inline]
+    fn start_state(&self) -> Self::ID {
+        T::start_state(*self)
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: Self::ID) -> bool {
+        T::is_match_state(*self, id)
+    }
+
+    #[inline]
+    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
+        T::is_match_or_dead_state(*self, id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: Self::ID) -> bool {
+        T::is_dead_state(*self, id)
+    }
+
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        T::is_anchored(*self)
+    }
+
+    #[inline]
+    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
+        T::next_state(*self, current, input)
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(
+        &self,
+        current: Self::ID,
+        input: u8,
+    ) -> Self::ID {
+        T::next_state_unchecked(*self, current, input)
+    }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..70fe436
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,150 @@
+use std::error;
+use std::fmt;
+use std::result;
+
+use regex_syntax;
+
+pub type Result<T> = result::Result<T, Error>;
+
+/// An error that occurred while building or serializing a DFA.
+#[derive(Clone, Debug)]
+pub struct Error {
+    kind: ErrorKind, // exposed read-only through `Error::kind`
+}
+
+/// The kind of error that occurred.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+    /// An error that occurred while parsing a regular expression. Note that
+    /// this error may be printed over multiple lines, and is generally
+    /// intended to be end user readable on its own.
+    Syntax(String),
+    /// An error that occurred because an unsupported regex feature was used.
+    /// The message string describes which unsupported feature was used.
+    ///
+    /// The primary regex features that are unsupported are those that require
+    /// look-around, such as the `^` and `$` anchors and the word boundary
+    /// assertion `\b`. These may be supported in the future.
+    Unsupported(String),
+    /// An error that occurred when attempting to serialize a DFA to bytes.
+    Serialize(String),
+    /// An error that occurs when constructing a DFA would require the use of
+    /// a state ID that overflows the chosen state ID representation. For
+    /// example, if one is using `u8` for state IDs and builds a DFA with
+    /// 257 states, then the last state's ID will be `256` which cannot be
+    /// represented with `u8`.
+    ///
+    /// Typically, this error occurs in the determinization process of building
+    /// a DFA (the conversion step from NFA to DFA). It can also occur when
+    /// trying to build a smaller DFA from an existing one.
+    StateIDOverflow {
+        /// The maximum possible state ID.
+        max: usize,
+    },
+    /// An error that occurs when premultiplication of state IDs is requested,
+    /// but doing so would overflow the chosen state ID representation.
+    ///
+    /// When `max == requested_max`, then the state ID would overflow `usize`.
+    PremultiplyOverflow {
+        /// The maximum possible state ID.
+        max: usize,
+        /// The maximum ID required by premultiplication.
+        requested_max: usize,
+    },
+}
+
+impl Error {
+    /// Return the kind of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
+        Error { kind: ErrorKind::Syntax(err.to_string()) }
+    }
+
+    pub(crate) fn unsupported_anchor() -> Error {
+        let msg = r"anchors such as ^, $, \A and \z are not supported";
+        Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+    }
+
+    pub(crate) fn unsupported_word() -> Error {
+        let msg = r"word boundary assertions (\b and \B) are not supported";
+        Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+    }
+
+    pub(crate) fn unsupported_longest_match() -> Error {
+        let msg = "unanchored searches with longest match \
+                   semantics are not supported";
+        Error { kind: ErrorKind::Unsupported(msg.to_string()) }
+    }
+
+    pub(crate) fn serialize(message: &str) -> Error {
+        Error { kind: ErrorKind::Serialize(message.to_string()) }
+    }
+
+    pub(crate) fn state_id_overflow(max: usize) -> Error {
+        Error { kind: ErrorKind::StateIDOverflow { max } }
+    }
+
+    pub(crate) fn premultiply_overflow(
+        max: usize,
+        requested_max: usize,
+    ) -> Error {
+        Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
+    }
+}
+
+impl error::Error for Error {
+    /// Returns a short, static label for the category of this error.
+    fn description(&self) -> &str {
+        use self::ErrorKind::*;
+        match self.kind {
+            Syntax(_) => "syntax error",
+            Unsupported(_) => "unsupported syntax",
+            Serialize(_) => "serialization error",
+            StateIDOverflow { .. } => "state id representation too small",
+            PremultiplyOverflow { .. } => {
+                "state id representation too small for premultiplication"
+            }
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.kind {
+            ErrorKind::Syntax(ref msg) => write!(f, "{}", msg),
+            ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg),
+            ErrorKind::Serialize(ref msg) => {
+                write!(f, "DFA serialization error: {}", msg)
+            }
+            ErrorKind::StateIDOverflow { max } => write!(
+                f,
+                "building the DFA failed because it required building \
+                 more states than can be identified, where the maximum \
+                 ID for the chosen representation is {}",
+                max,
+            ),
+            ErrorKind::PremultiplyOverflow { max, requested_max } => {
+                if max == requested_max {
+                    write!(
+                        f,
+                        "premultiplication of states requires the ability to \
+                         represent a state ID greater than what can fit on \
+                         this platform's usize, which is {}",
+                        ::std::usize::MAX,
+                    )
+                } else {
+                    write!(
+                        f,
+                        "premultiplication of states requires the ability to \
+                         represent at least a state ID of {}, but the chosen \
+                         representation only permits a maximum state ID of {}",
+                        requested_max, max,
+                    )
+                }
+            }
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..4d3e9c1
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,360 @@
+/*!
+A low level regular expression library that uses deterministic finite automata.
+It supports a rich syntax with Unicode support, has extensive options for
+configuring the best space vs time trade off for your use case and provides
+support for cheap deserialization of automata for use in `no_std` environments.
+
+# Overview
+
+This section gives a brief overview of the primary types in this crate:
+
+* A [`Regex`](struct.Regex.html) provides a way to search for matches of a
+  regular expression. This includes iterating over matches with both the start
+  and end positions of each match.
+* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many
+  compilation options for a regex.
+* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that
+  uses a dense representation (uses lots of space, but fast searching).
+* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`,
+  but uses a sparse representation (uses less space, but slower matching).
+* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must
+  implement.
+* Both dense DFAs and sparse DFAs support
+  [serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian)
+  and
+  [cheap deserialization](enum.DenseDFA.html#method.from_bytes).
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::Regex;
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+```
+
+# Example: use sparse DFAs
+
+By default, compiling a regex will use dense DFAs internally. This uses more
+memory, but executes searches more quickly. If you can abide slower searches
+(somewhere around 3-5x), then sparse DFAs might make more sense since they can
+use significantly less space.
+
+Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
+`Regex::new`:
+
+```
+use regex_automata::Regex;
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+If you already have dense DFAs for some reason, they can be converted to sparse
+DFAs and used to build a new `Regex`. For example:
+
+```
+use regex_automata::Regex;
+
+# fn example() -> Result<(), regex_automata::Error> {
+let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let sparse_re = Regex::from_dfas(
+    dense_re.forward().to_sparse()?,
+    dense_re.reverse().to_sparse()?,
+);
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+# Example: deserialize a DFA
+
+This shows how to first serialize a DFA into raw bytes, and then deserialize
+those raw bytes back into a DFA. While this particular example is a bit
+contrived, this same technique can be used in your program to deserialize a
+DFA at start up time or by memory mapping a file. In particular,
+deserialization is guaranteed to be cheap because it will always be a constant
+time operation.
+
+```
+use regex_automata::{DenseDFA, Regex};
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both the forward and reverse DFAs, see note below
+let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?;
+let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?;
+// now deserialize both---we need to specify the correct type!
+let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) };
+let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) };
+// finally, reconstruct our regex
+let re2 = Regex::from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+There are a few points worth noting here:
+
+* We need to extract the raw DFAs used by the regex and serialize those. You
+  can build the DFAs manually yourself using
+  [`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a
+  `Regex` guarantees that the DFAs are built correctly.
+* We specifically convert the dense DFA to a representation that uses `u16`
+  for its state identifiers using
+  [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't
+  strictly necessary, if we skipped this step, then the serialized bytes would
+  use `usize` for state identifiers, which does not have a fixed size. Using
+  `u16` ensures that we can deserialize this DFA even on platforms with a
+  smaller pointer size. If our DFA is too big for `u16` state identifiers, then
+  one can use `u32` or `u64`.
+* To convert the DFA to raw bytes, we use the `to_bytes_native_endian`
+  method. In practice, you'll want to use either
+  [`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
+  or
+  [`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian),
+  depending on which platform you're deserializing your DFA from. If you intend
+  to deserialize on either platform, then you'll need to serialize both and
+  deserialize the right one depending on your target's endianness.
+* Deserializing a DFA requires the use of `unsafe` because the raw bytes must
+  be *trusted*. In particular, while some degree of sanity checks are
+  performed, nothing guarantees the integrity of the DFA's transition table
+  since deserialization is a constant time operation. Since searching with a
+  DFA must be able to follow transitions blindly for performance reasons,
+  giving incorrect bytes to the deserialization API can result in memory
+  unsafety.
+
+The same process can be achieved with sparse DFAs as well:
+
+```
+use regex_automata::{SparseDFA, Regex};
+
+# fn example() -> Result<(), regex_automata::Error> {
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both
+let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
+let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
+// now deserialize both---we need to specify the correct type!
+let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) };
+let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) };
+// finally, reconstruct our regex
+let re2 = Regex::from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+# Ok(()) }; example().unwrap()
+```
+
+Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
+Conversely, dense DFAs must be aligned to the same alignment as their
+state identifier representation.
+
+# Support for `no_std`
+
+This crate comes with a `std` feature that is enabled by default. When the
+`std` feature is enabled, the API of this crate will include the facilities
+necessary for compiling, serializing, deserializing and searching with regular
+expressions. When the `std` feature is disabled, the API of this crate will
+shrink such that it only includes the facilities necessary for deserializing
+and searching with regular expressions.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `std` feature that compiles and serializes a
+  regular expression. Serialization should only happen after first converting
+  the DFAs to use a fixed size state identifier instead of the default `usize`.
+  You may also need to serialize both little and big endian versions of each
+  DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+  your previously serialized DFAs into regexes. You can then search with them
+  as you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+Note that the
+[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
+tool will do the first step for you with its `dfa` or `regex` sub-commands.
+
+# Syntax
+
+This crate supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax).
+
+Currently, there are a couple limitations. In general, this crate does not
+support zero-width assertions, although they may be added in the future. This
+includes:
+
+* Anchors such as `^`, `$`, `\A` and `\z`.
+* Word boundary assertions such as `\b` and `\B`.
+
+It is possible to run a search that is anchored at the beginning of the input.
+To do that, set the
+[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored)
+option when building a regex. By default, all searches are unanchored.
+
+# Differences with the regex crate
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this crate provides a lower level
+regular expression interface that is a bit less convenient while providing more
+explicit control over memory usage and search times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+  of the regex pattern. While most patterns do not exhibit worst case
+  exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
+  build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
+  not be compiled with this library. (In the future, the API may expose an
+  option to return an error if the DFA gets too big.)
+* This crate does not support sub-match extraction, which can be achieved with
+  the regex crate's "captures" API. This may be added in the future, but is
+  unlikely.
+* While the regex crate doesn't necessarily sport fast compilation times, the
+  regexes in this crate are almost universally slow to compile, especially when
+  they contain large Unicode character classes. For example, on my system,
+  compiling `\w{3}` with byte classes enabled takes just over 1 second and
+  almost 5MB of memory! (Compiling a sparse regex takes about the same time
+  but only uses about 500KB of memory.) Conversely, compiling the same regex
+  without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
+  less than 5KB of memory. For this reason, you should only use Unicode
+  character classes if you absolutely need them!
+* This crate does not support regex sets.
+* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
+  `\B`.
+* As a lower level crate, this library does not do literal optimizations. In
+  exchange, you get predictable performance regardless of input. The
+  philosophy here is that literal optimizations should be applied at a higher
+  level, although there is no easy support for this in the ecosystem yet.
+* There is no `&str` API like in the regex crate. In this crate, all APIs
+  operate on `&[u8]`. By default, match indices are guaranteed to fall on
+  UTF-8 boundaries, unless
+  [`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
+  is enabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+  deserialized. Deserialization always takes constant time since searching can
+  be performed directly on the raw serialized bytes of a DFA.
+* This crate was specifically designed so that the searching phase of a DFA has
+  minimal runtime requirements, and can therefore be used in `no_std`
+  environments. While `no_std` environments cannot compile regexes, they can
+  deserialize pre-compiled regexes.
+* Since this crate builds DFAs ahead of time, it will generally out-perform
+  the `regex` crate on equivalent tasks. The performance difference is likely
+  not large. However, because of a complex set of optimizations in the regex
+  crate (like literal optimizations), an accurate performance comparison may be
+  difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+  performance a small amount, but uses much less storage space. Potentially
+  even less than what the regex crate uses.
+* This crate exposes DFAs directly, such as
+  [`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html),
+  which enables one to do less work in some cases. For example, if you only
+  need the end of a match and not the start of a match, then you can use a DFA
+  directly without building a `Regex`, which always requires a second DFA to
+  find the start of a match.
+* Aside from choosing between dense and sparse DFAs, there are several options
+  for configuring the space usage vs search time trade off. These include
+  things like choosing a smaller state identifier representation, to
+  premultiplying state identifiers and splitting a DFA's alphabet into
+  equivalence classes. Finally, DFA minimization is also provided, but can
+  increase compilation times dramatically.
+*/
+
+#![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+#[cfg(feature = "std")]
+extern crate core;
+
+#[cfg(all(test, feature = "transducer"))]
+extern crate bstr;
+extern crate byteorder;
+#[cfg(feature = "transducer")]
+extern crate fst;
+#[cfg(feature = "std")]
+extern crate regex_syntax;
+
+pub use dense::DenseDFA;
+pub use dfa::DFA;
+#[cfg(feature = "std")]
+pub use error::{Error, ErrorKind};
+pub use regex::Regex;
+#[cfg(feature = "std")]
+pub use regex::RegexBuilder;
+pub use sparse::SparseDFA;
+pub use state_id::StateID;
+
+mod classes;
+#[path = "dense.rs"]
+mod dense_imp;
+#[cfg(feature = "std")]
+mod determinize;
+mod dfa;
+#[cfg(feature = "std")]
+mod error;
+#[cfg(feature = "std")]
+mod minimize;
+#[cfg(feature = "std")]
+#[doc(hidden)]
+pub mod nfa;
+mod regex;
+#[path = "sparse.rs"]
+mod sparse_imp;
+#[cfg(feature = "std")]
+mod sparse_set;
+mod state_id;
+#[cfg(feature = "transducer")]
+mod transducer;
+
+/// Types and routines specific to dense DFAs.
+///
+/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its
+/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html)
+/// and [`ByteClass`](struct.ByteClass.html).
+///
+/// This module also contains a [builder](struct.Builder.html) for
+/// configuring the construction of a dense DFA.
+pub mod dense {
+    pub use dense_imp::*;
+}
+
+/// Types and routines specific to sparse DFAs.
+///
+/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of
+/// its corresponding variant DFA types, such as
+/// [`Standard`](struct.Standard.html) and
+/// [`ByteClass`](struct.ByteClass.html).
+///
+/// Unlike the [`dense`](../dense/index.html) module, this module does not
+/// contain a builder specific to sparse DFAs. Instead, the intended way to
+/// build a sparse DFA is either by using a default configuration with its
+/// [constructor](enum.SparseDFA.html#method.new),
+/// or by first
+/// [configuring the construction of a dense DFA](../dense/struct.Builder.html)
+/// and then calling
+/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse).
+pub mod sparse {
+    pub use sparse_imp::*;
+}
diff --git a/src/minimize.rs b/src/minimize.rs
new file mode 100644
index 0000000..ededa5f
--- /dev/null
+++ b/src/minimize.rs
@@ -0,0 +1,373 @@
+use std::cell::RefCell;
+use std::fmt;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// An implementation of Hopcroft's algorithm for minimizing DFAs.
+///
+/// The algorithm implemented here is mostly taken from Wikipedia:
+/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
+///
+/// This code has had some light optimization attention paid to it,
+/// particularly in the form of reducing allocation as much as possible.
+/// However, it is still generally slow. Future optimization work should
+/// probably focus on the bigger picture rather than micro-optimizations. For
+/// example:
+///
+/// 1. Figure out how to more intelligently create initial partitions. That is,
+///    Hopcroft's algorithm starts by creating two partitions of DFA states
+///    that are known to NOT be equivalent: match states and non-match states.
+///    The algorithm proceeds by progressively refining these partitions into
+///    smaller partitions. If we could start with more partitions, then we
+///    could reduce the amount of work that Hopcroft's algorithm needs to do.
+/// 2. For every partition that we visit, we find all incoming transitions to
+///    every state in the partition for *every* element in the alphabet. (This
+///    is why using byte classes can significantly decrease minimization times,
+///    since byte classes shrink the alphabet.) This is quite costly and there
+///    is perhaps some redundant work being performed depending on the specific
+///    states in the set. For example, we might be able to only visit some
+///    elements of the alphabet based on the transitions.
+/// 3. Move parts of minimization into determinization. If minimization has
+///    fewer states to deal with, then it should run faster. A prime example
+///    of this might be large Unicode classes, which are generated in a way
+///    that can create a lot of redundant states. (Some work has been done on
+///    this point during NFA compilation via the algorithm described in the
+///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
+///    paper.)
+pub(crate) struct Minimizer<'a, S: 'a> {
+    dfa: &'a mut DFARepr<S>, // the DFA that `run` rewrites in place
+    in_transitions: Vec<Vec<Vec<S>>>, // [target][byte] -> source states
+    partitions: Vec<StateSet<S>>, // current partitioning of the states
+    waiting: Vec<StateSet<S>>, // sets still to be used for refinement
+}
+
+impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("Minimizer");
+        out.field("dfa", &self.dfa);
+        out.field("in_transitions", &self.in_transitions);
+        out.field("partitions", &self.partitions);
+        out.field("waiting", &self.waiting);
+        out.finish()
+    }
+}
+
+/// A set of states. A state set makes up a single partition in Hopcroft's
+/// algorithm.
+///
+/// It is represented by an ordered set of state identifiers. We use shared
+/// ownership so that a single state set can be in both the set of partitions
+/// and in the set of waiting sets simultaneously without an additional
+/// allocation. Generally, once a state set is built, it becomes immutable.
+///
+/// We use this representation because it avoids the overhead of more
+/// traditional set data structures (HashSet/BTreeSet), and also because
+/// computing intersection/subtraction on this representation is especially
+/// fast.
+#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+struct StateSet<S>(Rc<RefCell<Vec<S>>>);
+
+impl<'a, S: StateID> Minimizer<'a, S> {
+    pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
+        let partitions = Self::initial_partitions(dfa);
+        let in_transitions = Self::incoming_transitions(dfa);
+        // The work list starts out holding only the first initial partition.
+        let waiting = vec![partitions[0].clone()];
+        Minimizer { dfa, in_transitions, partitions, waiting }
+    }
+
+    pub fn run(mut self) {
+        let mut incoming = StateSet::empty();
+        let mut scratch1 = StateSet::empty();
+        let mut scratch2 = StateSet::empty();
+        let mut newparts = vec![];
+
+        while let Some(set) = self.waiting.pop() {
+            for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
+                self.find_incoming_to(b, &set, &mut incoming);
+                // Try to split every partition by the states in `incoming`.
+                for p in 0..self.partitions.len() {
+                    self.partitions[p].intersection(&incoming, &mut scratch1);
+                    if scratch1.is_empty() {
+                        newparts.push(self.partitions[p].clone());
+                        continue;
+                    }
+
+                    self.partitions[p].subtract(&incoming, &mut scratch2);
+                    if scratch2.is_empty() {
+                        newparts.push(self.partitions[p].clone());
+                        continue;
+                    }
+                    // Both halves are non-empty, so partition `p` is split.
+                    let (x, y) =
+                        (scratch1.deep_clone(), scratch2.deep_clone());
+                    newparts.push(x.clone());
+                    newparts.push(y.clone());
+                    match self.find_waiting(&self.partitions[p]) {
+                        Some(i) => {
+                            self.waiting[i] = x;
+                            self.waiting.push(y);
+                        }
+                        None => {
+                            if x.len() <= y.len() {
+                                self.waiting.push(x);
+                            } else {
+                                self.waiting.push(y);
+                            }
+                        }
+                    }
+                }
+                newparts = mem::replace(&mut self.partitions, newparts);
+                newparts.clear();
+            }
+        }
+
+        // At this point, we now have a minimal partitioning of states, where
+        // each partition is an equivalence class of DFA states. Now we need to
+        // use this partitioning to update the DFA to only contain one state
+        // for each partition.
+
+        // Create a map from DFA state ID to the representative ID of the
+        // equivalence class to which it belongs. The representative ID of an
+        // equivalence class of states is the minimum ID in that class.
+        let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
+        for p in &self.partitions {
+            p.iter(|id| state_to_part[id.to_usize()] = p.min());
+        }
+
+        // Generate a new contiguous sequence of IDs for minimal states, and
+        // create a map from equivalence IDs to the new IDs. Thus, the new
+        // minimal ID of *any* state in the unminimized DFA can be obtained
+        // with minimal_ids[state_to_part[old_id]].
+        let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
+        let mut new_id = S::from_usize(0);
+        for (id, _) in self.dfa.states() {
+            if state_to_part[id.to_usize()] == id {
+                minimal_ids[id.to_usize()] = new_id;
+                new_id = S::from_usize(new_id.to_usize() + 1);
+            }
+        }
+        // The total number of states in the minimal DFA.
+        let minimal_count = new_id.to_usize();
+
+        // Re-map this DFA in place such that the only states remaining
+        // correspond to the representative states of every equivalence class.
+        for id in (0..self.dfa.state_count()).map(S::from_usize) {
+            // If this state isn't a representative for an equivalence class,
+            // then we skip it since it won't appear in the minimal DFA.
+            if state_to_part[id.to_usize()] != id {
+                continue;
+            }
+            for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
+                *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
+            }
+            self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
+        }
+        // Trim off all unused states from the pre-minimized DFA. This
+        // represents all states that were merged into a non-singleton
+        // equivalence class of states, and appeared after the first state
+        // in each such class. (Because the state with the smallest ID in each
+        // equivalence class is its representative ID.)
+        self.dfa.truncate_states(minimal_count);
+
+        // Update the new start state, which is now just the minimal ID of
+        // whatever state the old start state was collapsed into.
+        let old_start = self.dfa.start_state();
+        self.dfa.set_start_state(
+            minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
+        );
+
+        // In order to update the ID of the maximum match state, we need to
+        // find the maximum ID among all of the match states in the minimized
+        // DFA. This is not necessarily the new ID of the unminimized maximum
+        // match state, since that could have been collapsed with a much
+        // earlier match state. Therefore, to find the new max match state,
+        // we iterate over all previous match states, find their corresponding
+        // new minimal ID, and take the maximum of those.
+        let old_max = self.dfa.max_match_state();
+        self.dfa.set_max_match_state(dead_id());
+        for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
+            let part = state_to_part[id.to_usize()];
+            let new_id = minimal_ids[part.to_usize()];
+            if new_id > self.dfa.max_match_state() {
+                self.dfa.set_max_match_state(new_id);
+            }
+        }
+    }
+
+    fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
+        self.waiting.iter().position(|queued| *queued == *set)
+    }
+
+    fn find_incoming_to(
+        &self,
+        b: u8,
+        set: &StateSet<S>,
+        incoming: &mut StateSet<S>,
+    ) {
+        // Rebuild `incoming` as every state with a `b`-transition into `set`.
+        incoming.clear();
+        set.iter(|target| {
+            let sources = &self.in_transitions[target.to_usize()][b as usize];
+            sources.iter().for_each(|&src| incoming.add(src));
+        });
+        incoming.canonicalize();
+    }
+
+    /// Build the starting partition: the set of match states (always
+    /// present, even if empty) and, when non-empty, the set of non-match
+    /// states, ordered by ascending size.
+    fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
+        let (mut matching, mut rest) = (StateSet::empty(), StateSet::empty());
+        for (id, _) in dfa.states() {
+            let target =
+                if dfa.is_match_state(id) { &mut matching } else { &mut rest };
+            target.add(id);
+        }
+
+        let mut sets = vec![matching];
+        if !rest.is_empty() {
+            sets.push(rest);
+        }
+        sets.sort_by_key(|s| s.len());
+        sets
+    }
+
+    /// Map each `(target, input)` pair to the states that transition into it.
+    fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
+        let empty_row: Vec<Vec<S>> = vec![vec![]; dfa.alphabet_len()];
+        let mut incoming: Vec<Vec<Vec<S>>> =
+            dfa.states().map(|_| empty_row.clone()).collect();
+        for (source, state) in dfa.states() {
+            for (b, target) in state.transitions() {
+                incoming[target.to_usize()][b as usize].push(source);
+            }
+        }
+        incoming
+    }
+}
+
+impl<S: StateID> StateSet<S> {
+    /// Create a new set with no states in it.
+    fn empty() -> StateSet<S> {
+        StateSet(Rc::new(RefCell::new(vec![])))
+    }
+
+    /// Append `id` to this set without enforcing ordering or uniqueness; see
+    /// `canonicalize`.
+    fn add(&mut self, id: S) {
+        self.0.borrow_mut().push(id);
+    }
+
+    /// Return the first ID in this set, which is the smallest ID whenever
+    /// the set is sorted.
+    ///
+    /// # Panics
+    ///
+    /// Panics if this set is empty.
+    fn min(&self) -> S {
+        self.0.borrow()[0]
+    }
+
+    /// Sort the IDs in this set and remove duplicates.
+    fn canonicalize(&mut self) {
+        // Take the mutable borrow once instead of once per operation. An
+        // unstable sort is fine since equal IDs are indistinguishable.
+        let mut ids = self.0.borrow_mut();
+        ids.sort_unstable();
+        ids.dedup();
+    }
+
+    /// Remove every ID from this set.
+    fn clear(&mut self) {
+        self.0.borrow_mut().clear();
+    }
+
+    /// Return the number of IDs stored in this set.
+    fn len(&self) -> usize {
+        self.0.borrow().len()
+    }
+
+    /// Return true if and only if this set contains no IDs.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Return a copy of this set that is backed by its own freshly allocated
+    /// storage, so that mutating the copy does not affect `self`.
+    fn deep_clone(&self) -> StateSet<S> {
+        StateSet(Rc::new(RefCell::new(self.0.borrow().clone())))
+    }
+
+    /// Call `f` on every ID in this set, in stored order.
+    ///
+    /// The underlying storage stays borrowed while `f` runs.
+    fn iter<F: FnMut(S)>(&self, mut f: F) {
+        for &id in self.0.borrow().iter() {
+            f(id);
+        }
+    }
+
+    /// Overwrite `dest` with the IDs found in both `self` and `other`.
+    ///
+    /// Both inputs must be sorted and free of duplicates (see
+    /// `canonicalize`). `dest` is cleared first, so it must not share its
+    /// storage with either input.
+    fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+        dest.clear();
+
+        let (seta, setb) = (self.0.borrow(), other.0.borrow());
+        let (mut i, mut j) = (0, 0);
+        // Sorted merge: step past the smaller ID, and emit an ID only when
+        // both sides agree on it.
+        while i < seta.len() && j < setb.len() {
+            let (a, b) = (seta[i], setb[j]);
+            if a == b {
+                dest.add(a);
+                i += 1;
+                j += 1;
+            } else if a < b {
+                i += 1;
+            } else {
+                j += 1;
+            }
+        }
+    }
+
+    /// Overwrite `dest` with the IDs found in `self` but not in `other`.
+    ///
+    /// Both inputs must be sorted and free of duplicates (see
+    /// `canonicalize`). `dest` is cleared first, so it must not share its
+    /// storage with either input.
+    fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+        dest.clear();
+        if self.is_empty() || other.is_empty() {
+            self.iter(|s| dest.add(s));
+            return;
+        }
+
+        let (seta, setb) = (self.0.borrow(), other.0.borrow());
+        let (mut i, mut j) = (0, 0);
+        while i < seta.len() && j < setb.len() {
+            let (a, b) = (seta[i], setb[j]);
+            if a == b {
+                i += 1;
+                j += 1;
+            } else if a < b {
+                dest.add(a);
+                i += 1;
+            } else {
+                j += 1;
+            }
+        }
+        // Once `other` is exhausted, whatever is left in `self` survives.
+        for &a in &seta[i..] {
+            dest.add(a);
+        }
+    }
+}
diff --git a/src/nfa/compiler.rs b/src/nfa/compiler.rs
new file mode 100644
index 0000000..d9b3945
--- /dev/null
+++ b/src/nfa/compiler.rs
@@ -0,0 +1,1193 @@
+// This module provides an NFA compiler using Thompson's construction
+// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+// graph as output. The NFA graph is structured in a way that permits it to be
+// executed by a virtual machine and also used to efficiently build a DFA.
+//
+// The compiler deals with a slightly expanded set of NFA states that notably
+// includes an empty node that has exactly one epsilon transition to the next
+// state. In other words, it's a "goto" instruction if one views Thompson's NFA
+// as a set of bytecode instructions. These goto instructions are removed in
+// a subsequent phase before returning the NFA to the caller. The purpose of
+// these empty nodes is that they make the construction algorithm substantially
+// simpler to implement. We remove them before returning to the caller because
+// they can represent substantial overhead when traversing the NFA graph
+// (either while searching using the NFA directly or while building a DFA).
+//
+// In the future, it would be nice to provide a Glushkov compiler as well,
+// as it would work well as a bit-parallel NFA for smaller regexes. But
+// the Thompson construction is one I'm more familiar with and seems more
+// straight-forward to deal with when it comes to large Unicode character
+// classes.
+//
+// Internally, the compiler uses interior mutability to improve composition
+// in the face of the borrow checker. In particular, we'd really like to be
+// able to write things like this:
+//
+//     self.c_concat(exprs.iter().map(|e| self.c(e)))
+//
+// Which elegantly uses iterators to build up a sequence of compiled regex
+// sub-expressions and then hands it off to the concatenating compiler
+// routine. Without interior mutability, the borrow checker won't let us
+// borrow `self` mutably both inside and outside the closure at the same
+// time.
+
+use std::cell::RefCell;
+use std::mem;
+
+use regex_syntax::hir::{self, Hir, HirKind};
+use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
+
+use classes::ByteClassSet;
+use error::{Error, Result};
+use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
+use nfa::range_trie::RangeTrie;
+use nfa::{State, StateID, Transition, NFA};
+
+/// Config knobs for the NFA compiler. See the builder's methods for more
+/// docs on each one.
+#[derive(Clone, Copy, Debug)]
+struct Config {
+    anchored: bool, // set via `Builder::anchored`
+    allow_invalid_utf8: bool, // set via `Builder::allow_invalid_utf8`
+    reverse: bool, // set via `Builder::reverse`
+    shrink: bool, // set via `Builder::shrink`
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        Config {
+            anchored: false, // matches may start anywhere in the input
+            allow_invalid_utf8: false, // only valid UTF-8 may be matched
+            reverse: false, // compile a forward NFA
+            shrink: true, // the only knob that is enabled by default
+        }
+    }
+}
+
+/// A builder for compiling an NFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+}
+
+impl Builder {
+    /// Create a new NFA builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default() }
+    }
+
+    /// Build a fresh NFA from the given high level intermediate
+    /// representation (HIR) of a regular expression, using a new compiler.
+    ///
+    /// An error is returned if the NFA could not be built, e.g., when the
+    /// regex uses unsupported features such as zero-width assertions.
+    pub fn build(&self, expr: &Hir) -> Result<NFA> {
+        let mut nfa = NFA::always_match();
+        let mut compiler = Compiler::new();
+        self.build_with(&mut compiler, &mut nfa, expr)?;
+        Ok(nfa)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into the NFA given using the given compiler. Callers may
+    /// prefer this over `build` if they would like to reuse allocations while
+    /// compiling many regular expressions.
+    ///
+    /// On success, the given NFA is completely overwritten with the NFA
+    /// produced by the compiler.
+    ///
+    /// If there was a problem building the NFA, then an error is returned. For
+    /// example, if the regex uses unsupported features (such as zero-width
+    /// assertions), then an error is returned. When an error is returned,
+    /// the contents of `nfa` are unspecified and should not be relied upon.
+    /// However, it can still be reused in subsequent calls to this method.
+    pub fn build_with(
+        &self,
+        compiler: &mut Compiler,
+        nfa: &mut NFA,
+        expr: &Hir,
+    ) -> Result<()> {
+        compiler.clear();
+        compiler.configure(self.config);
+        compiler.compile(nfa, expr)
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of the input. When
+    /// disabled, the NFA will act as if the pattern started with a `.*?`,
+    /// which enables a match to appear anywhere.
+    ///
+    /// By default this is disabled.
+    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
+        self.config.anchored = yes;
+        self
+    }
+
+    /// When enabled, the builder will permit the construction of an NFA that
+    /// may match invalid UTF-8.
+    ///
+    /// When disabled (the default), the builder is guaranteed to produce a
+    /// regex that will only ever match valid UTF-8 (otherwise, the builder
+    /// will return an error).
+    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
+        self.config.allow_invalid_utf8 = yes;
+        self
+    }
+
+    /// Reverse the NFA.
+    ///
+    /// A NFA reversal is performed by reversing all of the concatenated
+    /// sub-expressions in the original pattern, recursively. The resulting
+    /// NFA can be used to match the pattern starting from the end of a string
+    /// instead of the beginning of a string.
+    ///
+    /// Reversing the NFA is useful for building a reverse DFA, which is most
+    /// useful for finding the start of a match.
+    pub fn reverse(&mut self, yes: bool) -> &mut Builder {
+        self.config.reverse = yes;
+        self
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// This is enabled by default. Generally speaking, if one is using an NFA
+    /// to compile a DFA, then the extra time used to shrink the NFA will be
+    /// more than made up for during DFA construction (potentially by a lot).
+    /// In other words, enabling this can substantially decrease the overall
+    /// amount of time it takes to build a DFA.
+    ///
+    /// The only reason to disable this is if you want to compile an NFA and
+    /// start using it as quickly as possible without needing to build a DFA.
+    pub fn shrink(&mut self, yes: bool) -> &mut Builder {
+        self.config.shrink = yes;
+        self
+    }
+}
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+///
+/// Users of this crate cannot use a compiler directly. Instead, all one can
+/// do is create one and use it via the
+/// [`Builder::build_with`](struct.Builder.html#method.build_with)
+/// method. This permits callers to reuse compilers in order to amortize
+/// allocations.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+    /// The set of compiled NFA states. Once a state is compiled, it is
+    /// assigned a state ID equivalent to its index in this list. Subsequent
+    /// compilation can modify previous states by adding new transitions.
+    states: RefCell<Vec<CState>>,
+    /// The configuration from the builder.
+    config: Config,
+    /// State used for compiling character classes to UTF-8 byte automata.
+    /// State is not retained between character class compilations. This just
+    /// serves to amortize allocation to the extent possible.
+    utf8_state: RefCell<Utf8State>,
+    /// State used for arranging character classes in reverse into a trie.
+    trie_state: RefCell<RangeTrie>,
+    /// State used for caching common suffixes when compiling reverse UTF-8
+    /// automata (for Unicode character classes).
+    utf8_suffix: RefCell<Utf8SuffixMap>,
+    /// A map used to re-map state IDs when translating the compiler's internal
+    /// NFA state representation to the external NFA representation.
+    remap: RefCell<Vec<StateID>>,
+    /// A set of compiler internal state IDs that correspond to states that are
+    /// exclusively epsilon transitions, i.e., goto instructions, combined with
+    /// the state that they point to. This is used to record said states while
+    /// transforming the compiler's internal NFA representation to the external
+    /// form.
+    empties: RefCell<Vec<(StateID, StateID)>>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted to
+/// `State`s, which have a much simpler representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+    /// An empty state whose only purpose is to forward the automaton to
+    /// another state via an epsilon transition. These are useful during
+    /// compilation but are otherwise removed at the end.
+    Empty { next: StateID },
+    /// A state that only transitions to `next` if the current input byte is
+    /// in the range `[start, end]` (inclusive on both ends).
+    Range { range: Transition },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range.
+    /// As such, this may only be used when every transition has equal
+    /// priority. (In practice, this is only used for encoding large UTF-8
+    /// automata.)
+    Sparse { ranges: Vec<Transition> },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union { alternates: Vec<StateID> },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via later transitions
+    /// are preferred over earlier transitions.
+    ///
+    /// This "reverse" state exists for convenience during compilation that
+    /// permits easy construction of non-greedy combinations of NFA states.
+    /// At the end of compilation, Union and UnionReverse states are merged
+    /// into one Union type of state, where the latter has its epsilon
+    /// transitions reversed to reflect the priority inversion.
+    UnionReverse { alternates: Vec<StateID> },
+    /// A match state. There is exactly one such occurrence of this state in
+    /// an NFA.
+    Match,
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+    start: StateID,
+    end: StateID,
+}
+
+impl Compiler {
+    /// Create a new compiler with the default configuration and no states.
+    pub fn new() -> Compiler {
+        Compiler {
+            states: RefCell::new(vec![]),
+            config: Config::default(),
+            utf8_state: RefCell::new(Utf8State::new()),
+            trie_state: RefCell::new(RangeTrie::new()),
+            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+            remap: RefCell::new(vec![]),
+            empties: RefCell::new(vec![]),
+        }
+    }
+
+    /// Clear any memory used by this compiler such that it is ready to compile
+    /// a new regex.
+    ///
+    /// It is preferable to reuse a compiler if possible in order to reuse
+    /// allocations.
+    fn clear(&self) {
+        self.states.borrow_mut().clear();
+        // We don't need to clear anything else since they are cleared on
+        // their own and only when they are used.
+    }
+
+    /// Configure this compiler from the builder's knobs.
+    ///
+    /// The compiler is always reconfigured by the builder before using it to
+    /// build an NFA.
+    fn configure(&mut self, config: Config) {
+        self.config = config;
+    }
+
+    /// Compile `expr` into `nfa`, overwriting the NFA's previous contents.
+    fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
+        nfa.anchored = self.config.anchored;
+
+        let mut start = self.add_empty();
+        if !nfa.anchored {
+            let compiled = if self.config.allow_invalid_utf8 {
+                self.c_unanchored_prefix_invalid_utf8()?
+            } else {
+                self.c_unanchored_prefix_valid_utf8()?
+            };
+            self.patch(start, compiled.start);
+            start = compiled.end;
+        }
+        let compiled = self.c(expr)?;
+        let match_id = self.add_match();
+        self.patch(start, compiled.start);
+        self.patch(compiled.end, match_id);
+        self.finish(nfa);
+        Ok(())
+    }
+
+    /// Finishes the compilation process and populates the provided NFA with
+    /// the final graph.
+    fn finish(&self, nfa: &mut NFA) {
+        let mut bstates = self.states.borrow_mut();
+        let mut remap = self.remap.borrow_mut();
+        remap.resize(bstates.len(), 0);
+        let mut empties = self.empties.borrow_mut();
+        empties.clear();
+
+        // We don't reuse allocations here because this is what we're
+        // returning.
+        nfa.states.clear();
+        let mut byteset = ByteClassSet::new();
+
+        // The idea here is to convert our intermediate states to their final
+        // form. The only real complexity here is the process of converting
+        // transitions, which are expressed in terms of state IDs. The new
+        // set of states will be smaller because of partial epsilon removal,
+        // so the state IDs will not be the same.
+        for (id, bstate) in bstates.iter_mut().enumerate() {
+            match *bstate {
+                CState::Empty { next } => {
+                    // Since we're removing empty states, we need to handle
+                    // them later since we don't yet know which new state this
+                    // empty state will be mapped to.
+                    empties.push((id, next));
+                }
+                CState::Range { ref range } => {
+                    remap[id] = nfa.states.len();
+                    byteset.set_range(range.start, range.end);
+                    nfa.states.push(State::Range { range: range.clone() });
+                }
+                CState::Sparse { ref mut ranges } => {
+                    remap[id] = nfa.states.len();
+
+                    let ranges = mem::replace(ranges, vec![]);
+                    for r in &ranges {
+                        byteset.set_range(r.start, r.end);
+                    }
+                    nfa.states.push(State::Sparse {
+                        ranges: ranges.into_boxed_slice(),
+                    });
+                }
+                CState::Union { ref mut alternates } => {
+                    remap[id] = nfa.states.len();
+
+                    let alternates = mem::replace(alternates, vec![]);
+                    nfa.states.push(State::Union {
+                        alternates: alternates.into_boxed_slice(),
+                    });
+                }
+                CState::UnionReverse { ref mut alternates } => {
+                    remap[id] = nfa.states.len();
+
+                    let mut alternates = mem::replace(alternates, vec![]);
+                    alternates.reverse();
+                    nfa.states.push(State::Union {
+                        alternates: alternates.into_boxed_slice(),
+                    });
+                }
+                CState::Match => {
+                    remap[id] = nfa.states.len();
+                    nfa.states.push(State::Match);
+                }
+            }
+        }
+        for &(empty_id, mut empty_next) in empties.iter() {
+            // empty states can point to other empty states, forming a chain.
+            // So we must follow the chain until the end, which must end at
+            // a non-empty state, and therefore, a state that is correctly
+            // remapped. We are guaranteed to terminate because our compiler
+            // never builds a loop among empty states.
+            while let CState::Empty { next } = bstates[empty_next] {
+                empty_next = next;
+            }
+            remap[empty_id] = remap[empty_next];
+        }
+        for state in &mut nfa.states {
+            state.remap(&remap);
+        }
+        // The compiler always begins the NFA at the first state.
+        nfa.start = remap[0];
+        nfa.byte_classes = byteset.byte_classes();
+    }
+
+    /// Compile an arbitrary HIR expression by dispatching on its kind.
+    ///
+    /// Anchors and word boundaries are not supported and produce an error.
+    /// Groups are transparent: only their inner expression is compiled.
+    fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
+        match *expr.kind() {
+            HirKind::Empty => {
+                let id = self.add_empty();
+                Ok(ThompsonRef { start: id, end: id })
+            }
+            HirKind::Literal(hir::Literal::Unicode(ch)) => {
+                let mut utf8 = [0; 4];
+                let encoded = ch.encode_utf8(&mut utf8).as_bytes();
+                self.c_concat(encoded.iter().map(|&b| Ok(self.c_range(b, b))))
+            }
+            HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
+            HirKind::Class(hir::Class::Bytes(ref cls)) => {
+                self.c_byte_class(cls)
+            }
+            HirKind::Class(hir::Class::Unicode(ref cls)) => {
+                self.c_unicode_class(cls)
+            }
+            HirKind::Repetition(ref rep) => self.c_repetition(rep),
+            HirKind::Group(ref group) => self.c(&*group.hir),
+            HirKind::Concat(ref exprs) => {
+                self.c_concat(exprs.iter().map(|e| self.c(e)))
+            }
+            HirKind::Alternation(ref exprs) => {
+                self.c_alternation(exprs.iter().map(|e| self.c(e)))
+            }
+            HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
+            HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
+        }
+    }
+
+    /// Compile the given sub-expressions as a concatenation, patching the
+    /// end of each one to the start of the next.
+    ///
+    /// Sub-expressions are consumed back to front when compiling in reverse.
+    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
+    where
+        I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
+    {
+        let reverse = self.config.reverse;
+        let mut advance =
+            move || if reverse { it.next_back() } else { it.next() };
+        let ThompsonRef { start, mut end } = match advance() {
+            Some(first) => first?,
+            None => return Ok(self.c_empty()),
+        };
+        while let Some(result) = advance() {
+            let compiled = result?;
+            self.patch(end, compiled.start);
+            end = compiled.end;
+        }
+        Ok(ThompsonRef { start, end })
+    }
+
+    /// Compile an alternation by fanning a union state out to every branch.
+    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
+    where
+        I: Iterator<Item = Result<ThompsonRef>>,
+    {
+        let first = it.next().expect("alternations must be non-empty")?;
+        let second = match it.next() {
+            None => return Ok(first),
+            Some(result) => result?,
+        };
+
+        let (union, end) = (self.add_union(), self.add_empty());
+        let link = |alt: ThompsonRef| {
+            self.patch(union, alt.start);
+            self.patch(alt.end, end);
+        };
+        link(first);
+        link(second);
+        for result in it {
+            link(result?);
+        }
+        Ok(ThompsonRef { start: union, end })
+    }
+
+    /// Compile a repetition operator applied to a sub-expression.
+    ///
+    /// Each repetition kind is forwarded to a dedicated routine:
+    ///
+    /// * `?` goes to `c_zero_or_one`.
+    /// * `*`, `+` and `{n,}` go to `c_at_least` with a minimum of 0, 1 and
+    ///   `n`, respectively.
+    /// * `{n}` goes to `c_exactly` and `{m,n}` goes to `c_bounded`.
+    fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
+        use regex_syntax::hir::RepetitionKind as Kind;
+        use regex_syntax::hir::RepetitionRange as Range;
+
+        let (expr, greedy) = (&*rep.hir, rep.greedy);
+        match rep.kind {
+            Kind::ZeroOrOne => self.c_zero_or_one(expr, greedy),
+            Kind::ZeroOrMore => self.c_at_least(expr, greedy, 0),
+            Kind::OneOrMore => self.c_at_least(expr, greedy, 1),
+            Kind::Range(Range::Exactly(n)) => self.c_exactly(expr, n),
+            Kind::Range(Range::AtLeast(n)) => self.c_at_least(expr, greedy, n),
+            Kind::Range(Range::Bounded(min, max)) => {
+                self.c_bounded(expr, greedy, min, max)
+            }
+        }
+    }
+
+    /// Compile `expr{min,max}`: `min` copies, then `max - min` optional ones.
+    fn c_bounded(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> Result<ThompsonRef> {
+        let prefix = self.c_exactly(expr, min)?;
+        if min == max {
+            return Ok(prefix);
+        }
+
+        // It is tempting to compile the rest as a concatenation of
+        // zero-or-one matches, i.e., for `a{2,5}`, to compile it as if it
+        // were `aaa?a?a?`. The problem is that it leads to this program:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: alt(03, 04)
+        //      000003: 61 => 04
+        //      000004: alt(05, 06)
+        //      000005: 61 => 06
+        //      000006: alt(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // Once you hit state 2, the epsilon closure includes states 3, 4, 5,
+        // 6, 7 and 8, which is a lot. It is better to compile it like so:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: alt(03, 08)
+        //      000003: 61 => 04
+        //      000004: alt(05, 08)
+        //      000005: 61 => 06
+        //      000006: alt(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // So that the epsilon closure of state 2 is now just 3 and 8.
+        let empty = self.add_empty();
+        let mut prev_end = prefix.end;
+        for _ in min..max {
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            };
+            let compiled = self.c(expr)?;
+            self.patch(prev_end, union);
+            self.patch(union, compiled.start);
+            self.patch(union, empty);
+            prev_end = compiled.end;
+        }
+        self.patch(prev_end, empty);
+        Ok(ThompsonRef { start: prefix.start, end: empty })
+    }
+
+    /// Compile `expr` repeated at least `n` times.
+    ///
+    /// The looping union is added before `expr` is compiled when `n == 0` and
+    /// after it otherwise. State IDs follow creation order, so that order is
+    /// kept exactly as is.
+    fn c_at_least(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        n: u32,
+    ) -> Result<ThompsonRef> {
+        let new_union = || {
+            if greedy { self.add_union() } else { self.add_reverse_union() }
+        };
+        match n {
+            0 => {
+                let union = new_union();
+                let compiled = self.c(expr)?;
+                self.patch(union, compiled.start);
+                self.patch(compiled.end, union);
+                Ok(ThompsonRef { start: union, end: union })
+            }
+            1 => {
+                let compiled = self.c(expr)?;
+                let union = new_union();
+                self.patch(compiled.end, union);
+                self.patch(union, compiled.start);
+                Ok(ThompsonRef { start: compiled.start, end: union })
+            }
+            _ => {
+                let prefix = self.c_exactly(expr, n - 1)?;
+                let last = self.c(expr)?;
+                let union = new_union();
+                self.patch(prefix.end, last.start);
+                self.patch(last.end, union);
+                self.patch(union, last.start);
+                Ok(ThompsonRef { start: prefix.start, end: union })
+            }
+        }
+    }
+
+    /// Compile `expr?` as a union that either enters `expr` or skips it.
+    fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
+        let union =
+            if greedy { self.add_union() } else { self.add_reverse_union() };
+        let (inner, skip) = (self.c(expr)?, self.add_empty());
+        self.patch(union, inner.start);
+        self.patch(union, skip);
+        self.patch(inner.end, skip);
+        Ok(ThompsonRef { start: union, end: skip })
+    }
+
+    fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
+        // Compile `n` independent copies of `expr` and concatenate them.
+        self.c_concat((0..n).map(|_| self.c(expr)))
+    }
+
+    /// Compile a byte class into a single sparse state. Every range in the
+    /// class transitions to one shared empty state, which is added first and
+    /// serves as the end of the returned sub-graph.
+    fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
+        let end = self.add_empty();
+        let trans: Vec<Transition> = cls
+            .ranges()
+            .iter()
+            .map(|r| Transition { start: r.start(), end: r.end(), next: end })
+            .collect();
+        Ok(ThompsonRef { start: self.add_sparse(trans), end })
+    }
+
+    fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
+        // If all we have are ASCII ranges wrapped in a Unicode package, then
+        // there is zero reason to bring out the big guns. We can fit all ASCII
+        // ranges within a single sparse transition.
+        if cls.is_all_ascii() {
+            let end = self.add_empty();
+            let mut trans = Vec::with_capacity(cls.ranges().len());
+            for r in cls.iter() {
+                assert!(r.start() <= '\x7F');
+                assert!(r.end() <= '\x7F');
+                trans.push(Transition {
+                    start: r.start() as u8,
+                    end: r.end() as u8,
+                    next: end,
+                });
+            }
+            Ok(ThompsonRef { start: self.add_sparse(trans), end })
+        } else if self.config.reverse {
+            if !self.config.shrink {
+                // When we don't want to spend the extra time shrinking, we
+                // compile the UTF-8 automaton in reverse using something like
+                // the "naive" approach, but will attempt to re-use common
+                // suffixes.
+                self.c_unicode_class_reverse_with_suffix(cls)
+            } else {
+                // When we want to shrink our NFA for reverse UTF-8 automata,
+                // we cannot feed UTF-8 sequences directly to the UTF-8
+                // compiler, since the UTF-8 compiler requires all sequences
+                // to be lexicographically sorted. Instead, we organize our
+                // sequences into a range trie, which can then output our
+                // sequences in the correct order. Unfortunately, building the
+                // range trie is fairly expensive (but not nearly as expensive
+                // as building a DFA). Hence the reason why the 'shrink' option
+                // exists, so that this path can be toggled off.
+                let mut trie = self.trie_state.borrow_mut();
+                trie.clear();
+
+                for rng in cls.iter() {
+                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+                        seq.reverse();
+                        trie.insert(seq.as_slice());
+                    }
+                }
+                let mut utf8_state = self.utf8_state.borrow_mut();
+                let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+                trie.iter(|seq| {
+                    utf8c.add(&seq);
+                });
+                Ok(utf8c.finish())
+            }
+        } else {
+            // In the forward direction, we always shrink our UTF-8 automata
+            // because we can stream it right into the UTF-8 compiler. There
+            // is almost no downside (in either memory or time) to using this
+            // approach.
+            let mut utf8_state = self.utf8_state.borrow_mut();
+            let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
+            for rng in cls.iter() {
+                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+                    utf8c.add(seq.as_slice());
+                }
+            }
+            Ok(utf8c.finish())
+        }
+
+        // For reference, the code below is the "naive" version of compiling a
+        // UTF-8 automaton. It is deliciously simple (and works for both the
+        // forward and reverse cases), but will unfortunately produce very
+        // large NFAs. When compiling a forward automaton, the size difference
+        // can sometimes be an order of magnitude. For example, the '\w' regex
+        // will generate about 3000 NFA states using the naive approach below,
+        // but only 283 states when using the approach above. This is because
+        // the approach above actually compiles a *minimal* (or near minimal,
+        // because of the bounded hashmap) UTF-8 automaton.
+        //
+        // The code below is kept as a reference point in order to make it
+        // easier to understand the higher level goal here.
+        /*
+        let it = cls
+            .iter()
+            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+            .map(|seq| {
+                let it = seq
+                    .as_slice()
+                    .iter()
+                    .map(|rng| Ok(self.c_range(rng.start, rng.end)));
+                self.c_concat(it)
+            });
+        self.c_alternation(it);
+        */
+    }
+
+    /// Compile `cls` for a reverse NFA without the range trie, sharing NFA
+    /// states between UTF-8 sequences by way of the suffix cache.
+    fn c_unicode_class_reverse_with_suffix(
+        &self,
+        cls: &hir::ClassUnicode,
+    ) -> Result<ThompsonRef> {
+        // N.B. Caching common *prefixes* would likely be better in the
+        // reverse direction, but it's not quite clear how to do that.
+        // Caching suffixes is still a win, and it has a very small
+        // additional overhead.
+        let mut suffixes = self.utf8_suffix.borrow_mut();
+        suffixes.clear();
+
+        let start = self.add_union();
+        let finish = self.add_empty();
+        let seqs = cls
+            .iter()
+            .flat_map(|r| Utf8Sequences::new(r.start(), r.end()));
+        for seq in seqs {
+            let mut target = finish;
+            for brng in seq.as_slice() {
+                let (lo, hi) = (brng.start, brng.end);
+                let key = Utf8SuffixKey { from: target, start: lo, end: hi };
+                let hash = suffixes.hash(&key);
+                target = match suffixes.get(&key, hash) {
+                    Some(cached) => cached,
+                    None => {
+                        let id = self.add_range(lo, hi);
+                        self.patch(id, target);
+                        suffixes.set(key, hash, id);
+                        id
+                    }
+                };
+            }
+            self.patch(start, target);
+        }
+        Ok(ThompsonRef { start, end: finish })
+    }
+
+    fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
+        let sid = self.add_range(start, end);
+        ThompsonRef { start: sid, end: sid } // one state: entry and exit
+    }
+
+    fn c_empty(&self) -> ThompsonRef {
+        let sid = self.add_empty();
+        ThompsonRef { start: sid, end: sid } // one state: entry and exit
+    }
+
+    fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
+        // Lazy `.*?` prefix where `.` is any Unicode scalar value.
+        let kind = hir::RepetitionKind::ZeroOrMore;
+        let any = Box::new(Hir::any(false));
+        let rep = hir::Repetition { kind, greedy: false, hir: any };
+        self.c(&Hir::repetition(rep))
+    }
+
+    fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
+        // Lazy `.*?` prefix where `.` is any single byte (0x00-0xFF).
+        let kind = hir::RepetitionKind::ZeroOrMore;
+        let any = Box::new(Hir::any(true));
+        let rep = hir::Repetition { kind, greedy: false, hir: any };
+        self.c(&Hir::repetition(rep))
+    }
+
+    /// Connect `from` to `to`.
+    ///
+    /// Empty and range states have their single target overwritten, both
+    /// kinds of union state gain `to` as one more alternate, and match states
+    /// are left untouched. Sparse states cannot be patched and panic.
+    fn patch(&self, from: StateID, to: StateID) {
+        let mut states = self.states.borrow_mut();
+        match states[from] {
+            CState::Match => {}
+            CState::Sparse { .. } => {
+                panic!("cannot patch from a sparse NFA state")
+            }
+            CState::Empty { ref mut next } => *next = to,
+            CState::Range { ref mut range } => range.next = to,
+            CState::Union { ref mut alternates }
+            | CState::UnionReverse { ref mut alternates } => {
+                alternates.push(to)
+            }
+        }
+    }
+
+    fn add_empty(&self) -> StateID {
+        let mut states = self.states.borrow_mut();
+        states.push(CState::Empty { next: 0 });
+        states.len() - 1 // ID of the state just pushed
+    }
+
+    /// Append a range state and return its ID; its target starts out as 0.
+    fn add_range(&self, start: u8, end: u8) -> StateID {
+        let mut states = self.states.borrow_mut();
+        let range = Transition { start, end, next: 0 };
+        states.push(CState::Range { range });
+        states.len() - 1
+    }
+
+    /// Append a state for `ranges`; a single range becomes a `Range` state
+    /// rather than a one-element `Sparse` state. Returns the new state's ID.
+    fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
+        let state = if ranges.len() == 1 {
+            CState::Range { range: ranges[0] }
+        } else {
+            CState::Sparse { ranges }
+        };
+        let mut states = self.states.borrow_mut();
+        states.push(state);
+        states.len() - 1
+    }
+
+    fn add_union(&self) -> StateID {
+        // Alternates start out empty; `patch` appends to them.
+        let mut states = self.states.borrow_mut();
+        states.push(CState::Union { alternates: vec![] });
+        states.len() - 1
+    }
+
+    fn add_reverse_union(&self) -> StateID {
+        // Alternates start out empty; `patch` appends to them.
+        let mut states = self.states.borrow_mut();
+        states.push(CState::UnionReverse { alternates: vec![] });
+        states.len() - 1
+    }
+
+    fn add_match(&self) -> StateID {
+        let mut states = self.states.borrow_mut();
+        states.push(CState::Match);
+        states.len() - 1 // ID of the state just pushed
+    }
+}
+
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+    nfac: &'a Compiler, // compiler that receives the generated states
+    state: &'a mut Utf8State, // scratch space, cleared by `new`
+    target: StateID, // final state shared by every added sequence
+}
+
+#[derive(Clone, Debug)]
+struct Utf8State {
+    compiled: Utf8BoundedMap, // cache: transition list -> existing state
+    uncompiled: Vec<Utf8Node>, // stack of nodes not yet turned into states
+}
+
+#[derive(Clone, Debug)]
+struct Utf8Node {
+    trans: Vec<Transition>, // transitions whose targets are already known
+    last: Option<Utf8LastTransition>, // newest range, target still pending
+}
+
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+    start: u8, // lower bound of the pending byte range
+    end: u8, // upper bound of the pending byte range
+}
+
+impl Utf8State {
+    fn new() -> Utf8State { // cache capped at 5000 slots; no nodes yet
+        Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
+    }
+
+    fn clear(&mut self) {
+        self.compiled.clear(); // also allocates the slots on first use
+        self.uncompiled.clear();
+    }
+}
+
+impl<'a> Utf8Compiler<'a> {
+    /// Create the shared target state, reset `state`, push an empty root.
+    fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
+        let target = nfac.add_empty();
+        state.clear();
+        let mut compiler = Utf8Compiler { nfac, state, target };
+        compiler.add_empty();
+        compiler
+    }
+
+    /// Compile every remaining node and return the finished fragment.
+    fn finish(&mut self) -> ThompsonRef {
+        self.compile_from(0);
+        let root = self.pop_root();
+        ThompsonRef { start: self.compile(root), end: self.target }
+    }
+
+    /// Add one sequence, keeping the prefix shared with uncompiled nodes.
+    fn add(&mut self, ranges: &[Utf8Range]) {
+        let shares = |range: &Utf8Range, node: &Utf8Node| match node.last {
+            Some(ref t) => t.start == range.start && t.end == range.end,
+            None => false,
+        };
+        let prefix_len = ranges
+            .iter()
+            .zip(&self.state.uncompiled)
+            .take_while(|&(range, node)| shares(range, node))
+            .count();
+        assert!(prefix_len < ranges.len());
+        self.compile_from(prefix_len);
+        self.add_suffix(&ranges[prefix_len..]);
+    }
+
+    /// Compile all uncompiled nodes deeper than `from`, deepest first.
+    fn compile_from(&mut self, from: usize) {
+        let mut target = self.target;
+        while self.state.uncompiled.len() > from + 1 {
+            let frozen = self.pop_freeze(target);
+            target = self.compile(frozen);
+        }
+        self.top_last_freeze(target);
+    }
+
+    /// Return a state with `node`'s transitions, reusing a cached one.
+    fn compile(&mut self, node: Vec<Transition>) -> StateID {
+        let slot = self.state.compiled.hash(&node);
+        match self.state.compiled.get(&node, slot) {
+            Some(existing) => existing,
+            None => {
+                let fresh = self.nfac.add_sparse(node.clone());
+                self.state.compiled.set(node, slot, fresh);
+                fresh
+            }
+        }
+    }
+
+    /// Queue `ranges` as pending transitions from the top uncompiled node.
+    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        let depth = self.state.uncompiled.len();
+        let last = depth.checked_sub(1).expect("non-empty nodes");
+        assert!(self.state.uncompiled[last].last.is_none());
+        let (head, tail) = (&ranges[0], &ranges[1..]);
+        self.state.uncompiled[last].last =
+            Some(Utf8LastTransition { start: head.start, end: head.end });
+        self.state.uncompiled.extend(tail.iter().map(|r| Utf8Node {
+            trans: vec![],
+            last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+        }));
+    }
+
+    fn add_empty(&mut self) {
+        let blank = Utf8Node { trans: vec![], last: None };
+        self.state.uncompiled.push(blank);
+    }
+
+    /// Pop the top node after pointing its pending transition at `next`.
+    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+        let mut top = self.state.uncompiled.pop().unwrap();
+        top.set_last_transition(next);
+        top.trans
+    }
+
+    /// Pop the root: it must be the only node and have nothing pending.
+    fn pop_root(&mut self) -> Vec<Transition> {
+        assert_eq!(self.state.uncompiled.len(), 1);
+        assert!(self.state.uncompiled[0].last.is_none());
+        let root = self.state.uncompiled.pop().expect("non-empty nodes");
+        root.trans
+    }
+
+    /// Point the top node's pending transition (if any) at `next`.
+    fn top_last_freeze(&mut self, next: StateID) {
+        let top = self.state.uncompiled.last_mut().expect("non-empty nodes");
+        top.set_last_transition(next);
+    }
+}
+
+impl Utf8Node {
+    /// Move the pending range, if any, into `trans` with `next` as target.
+    fn set_last_transition(&mut self, next: StateID) {
+        let pending = match self.last.take() {
+            Some(pending) => pending,
+            None => return,
+        };
+        let Utf8LastTransition { start, end } = pending;
+        self.trans.push(Transition { start, end, next });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use regex_syntax::hir::Hir;
+    use regex_syntax::ParserBuilder;
+
+    use super::{Builder, State, StateID, Transition, NFA};
+
+    fn parse(pattern: &str) -> Hir {
+        ParserBuilder::new().build().parse(pattern).unwrap()
+    }
+
+    fn build(pattern: &str) -> NFA {
+        Builder::new().anchored(true).build(&parse(pattern)).unwrap()
+    }
+
+    fn s_byte(byte: u8, next: StateID) -> State {
+        let trans = Transition { start: byte, end: byte, next };
+        State::Range { range: trans }
+    }
+
+    fn s_range(start: u8, end: u8, next: StateID) -> State {
+        let trans = Transition { start, end, next };
+        State::Range { range: trans }
+    }
+
+    fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
+        let ranges = ranges
+            .iter()
+            .map(|&(start, end, next)| Transition { start, end, next })
+            .collect();
+        State::Sparse { ranges }
+    }
+
+    fn s_union(alts: &[StateID]) -> State {
+        State::Union { alternates: alts.to_vec().into_boxed_slice() }
+    }
+
+    fn s_match() -> State {
+        State::Match
+    }
+
+    #[test]
+    fn errors() {
+        // Anchors are rejected by the NFA builder.
+        assert!(Builder::new().build(&parse(r"^")).is_err());
+        assert!(Builder::new().build(&parse(r"$")).is_err());
+        assert!(Builder::new().build(&parse(r"\A")).is_err());
+        assert!(Builder::new().build(&parse(r"\z")).is_err());
+
+        // Word boundaries (Unicode or ASCII) are rejected as well.
+        assert!(Builder::new().build(&parse(r"\b")).is_err());
+        assert!(Builder::new().build(&parse(r"\B")).is_err());
+        assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
+    }
+
+    // An unanchored NFA must begin with a non-greedy `.*?` prefix.
+    #[test]
+    fn compile_unanchored_prefix() {
+        // Case 1: the machine can only match valid UTF-8.
+        let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
+        // There should be many states since the `.` in `.*?` matches any
+        // Unicode scalar value.
+        assert_eq!(11, nfa.len());
+        assert_eq!(nfa.states[10], s_match());
+        assert_eq!(nfa.states[9], s_byte(b'a', 10));
+
+        // Case 2: invalid UTF-8 is allowed, so `.` is a single any-byte range.
+        let nfa = Builder::new()
+            .anchored(false)
+            .allow_invalid_utf8(true)
+            .build(&parse(r"a"))
+            .unwrap();
+        assert_eq!(
+            nfa.states,
+            &[
+                s_union(&[2, 1]),
+                s_range(0, 255, 0),
+                s_byte(b'a', 3),
+                s_match(),
+            ]
+        );
+    }
+
+    #[test]
+    fn compile_empty() {
+        assert_eq!(build("").states, &[s_match(),]);
+    }
+
+    #[test]
+    fn compile_literal() {
+        assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
+        assert_eq!(
+            build("ab").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+        );
+        assert_eq!(
+            build("☃").states,
+            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
+        );
+
+        // A literal byte that is not valid UTF-8 compiles to one byte state.
+        let hir = ParserBuilder::new()
+            .allow_invalid_utf8(true)
+            .build()
+            .parse(r"(?-u)\xFF")
+            .unwrap();
+        let nfa = Builder::new()
+            .anchored(true)
+            .allow_invalid_utf8(true)
+            .build(&hir)
+            .unwrap();
+        assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
+    }
+
+    #[test]
+    fn compile_class() {
+        assert_eq!(
+            build(r"[a-z]").states,
+            &[s_range(b'a', b'z', 1), s_match(),]
+        );
+        assert_eq!(
+            build(r"[x-za-c]").states,
+            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4]").states,
+            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+            &[
+                s_range(0xB1, 0xB4, 5),
+                s_range(0x99, 0x9E, 5),
+                s_byte(0xA4, 1),
+                s_byte(0x9F, 2),
+                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+                s_match(),
+            ]
+        );
+        assert_eq!(
+            build(r"[a-z☃]").states,
+            &[
+                s_byte(0x83, 3),
+                s_byte(0x98, 0),
+                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+                s_match(),
+            ]
+        );
+    }
+
+    #[test]
+    fn compile_repetition() {
+        assert_eq!(
+            build(r"a?").states,
+            &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
+        );
+        assert_eq!(
+            build(r"a??").states,
+            &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
+        );
+    }
+
+    #[test]
+    fn compile_group() {
+        assert_eq!(
+            build(r"ab+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
+        );
+        assert_eq!(
+            build(r"(ab)").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
+        );
+        assert_eq!(
+            build(r"(ab)+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
+        );
+    }
+
+    #[test]
+    fn compile_alternation() {
+        assert_eq!(
+            build(r"a|b").states,
+            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
+        );
+        assert_eq!(
+            build(r"|b").states,
+            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
+        );
+        assert_eq!(
+            build(r"a|").states,
+            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
+        );
+    }
+}
diff --git a/src/nfa/map.rs b/src/nfa/map.rs
new file mode 100644
index 0000000..e636c0d
--- /dev/null
+++ b/src/nfa/map.rs
@@ -0,0 +1,282 @@
+// This module contains a couple simple and purpose built hash maps. The key
+// trade off they make is that they serve as caches rather than true maps. That
+// is, inserting a new entry may cause eviction of another entry. This gives
+// us two things. First, there's less overhead associated with inserts and
+// lookups. Secondly, it lets us control our memory usage.
+//
+// These maps are used in some fairly hot code when generating NFA states for
+// large Unicode character classes.
+//
+// Instead of exposing a rich hashmap entry API, we just permit the caller
+// to produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking things a bit. But these are
+// for internal use only, so it's fine.
+//
+// The Utf8BoundedMap is used for Daciuk's algorithm for constructing an
+// (almost) minimal DFA for large Unicode character classes in linear time.
+// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
+// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
+// since there's a bit more expense in the reverse direction.)
+//
+// The Utf8SuffixMap is used when compiling large Unicode character classes for
+// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
+// construction of UTF-8 automata by caching common suffixes. This doesn't
+// get the same space savings as Daciuk's algorithm, but it's basically as
+// fast as the naive approach and typically winds up using less memory (since
+// it generates smaller NFAs) despite the presence of the cache.
+//
+// These maps effectively represent caching mechanisms for CState::Sparse and
+// CState::Range, respectively. The former represents a single NFA state with
+// many transitions of equivalent priority while the latter represents a single
+// NFA state with a single transition. (Neither state ever has or is an
+// epsilon transition.) Thus, they have different key types. It's likely we
+// could make one generic map, but the machinery didn't seem worth it. They
+// are simple enough.
+
+use nfa::{StateID, Transition};
+
+// FNV-1a hash constants (`INIT` seeds the hash, `PRIME` is the multiplier):
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+const PRIME: u64 = 1099511628211;
+const INIT: u64 = 14695981039346656037;
+
+/// A bounded hash map where the key is a sequence of NFA transitions and the
+/// value is a pre-existing NFA state ID.
+///
+/// std's hashmap could be used for this; however, this map has two important
+/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
+/// control our memory usage by limiting the number of slots. In general, the
+/// cost here is that this map acts as a cache. That is, inserting a new entry
+/// may remove an old entry. We are okay with this, since it does not impact
+/// correctness in the cases where it is used. The only effect that dropping
+/// states from the cache has is that the resulting NFA generated may be bigger
+/// than it otherwise would be.
+///
+/// This improves benchmarks that compile large Unicode character classes,
+/// since it makes the generation of (almost) minimal UTF-8 automata faster.
+/// Specifically, one could observe the difference with std's hashmap via
+/// something like the following benchmark:
+///
+///   hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+///
+/// But to observe that difference, you'd have to modify the code to use
+/// std's hashmap.
+///
+/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with
+/// a lot of less frequent states, then we could wind up with very poor caching
+/// behavior. Alas, the effectiveness of this cache has not been measured.
+/// Instead, ad hoc experiments suggest that it is "good enough." Additional
+/// smarts (such as an LRU eviction policy) have to be weighed against the
+/// amount of extra time they cost.
+#[derive(Clone, Debug)]
+pub struct Utf8BoundedMap {
+    /// The current version of this map. Only entries with matching versions
+    /// are considered during lookups. If an entry is found with a mismatched
+    /// version, then the map behaves as if the entry does not exist.
+    version: u16,
+    /// The total number of entries this map can store.
+    capacity: usize,
+    /// The actual entries, indexed by hash. A collision between two different
+    /// states results in the old state being dropped.
+    map: Vec<Utf8BoundedEntry>,
+}
+
+/// A single slot of a `Utf8BoundedMap`.
+#[derive(Clone, Debug, Default)]
+struct Utf8BoundedEntry {
+    /// The version of the map used to produce this entry. If this entry's
+    /// version does not match the current version of the map, then the map
+    /// should behave as if this entry does not exist.
+    version: u16,
+    /// The key, which is a sorted sequence of non-overlapping NFA transitions.
+    key: Vec<Transition>,
+    /// The state ID corresponding to the state containing the transitions in
+    /// this entry.
+    val: StateID,
+}
+
+impl Utf8BoundedMap {
+    /// Create a new bounded map with the given capacity. The map will never
+    /// grow beyond the given size.
+    ///
+    /// Note that this does not allocate. Instead, callers must call `clear`
+    /// before using this map. `clear` will allocate space if necessary.
+    ///
+    /// This avoids the need to pay for the allocation of this map when
+    /// compiling regexes that lack large Unicode character classes.
+    pub fn new(capacity: usize) -> Utf8BoundedMap {
+        assert!(capacity > 0);
+        Utf8BoundedMap { version: 0, capacity, map: vec![] }
+    }
+
+    /// Clear this map of all entries, but permit the reuse of allocation
+    /// if possible.
+    ///
+    /// This must be called before the map can be used.
+    pub fn clear(&mut self) {
+        // Never-written slots carry version 0 and an empty key, so a usable
+        // map must not be at version 0: otherwise looking up an empty key
+        // would "hit" such a slot. Skip 0 on first use and on wrap-around.
+        self.version = self.version.wrapping_add(1);
+        if self.version == 0 || self.map.is_empty() {
+            self.version = 1;
+            self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+        }
+    }
+
+    /// Return the slot index for the given transitions (FNV-1a, reduced).
+    pub fn hash(&self, key: &[Transition]) -> usize {
+        let mut h = INIT;
+        for t in key {
+            h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
+            h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
+            h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
+        }
+        (h as usize) % self.map.len()
+    }
+
+    /// Retrieve the cached state ID corresponding to the given key. The hash
+    /// given must have been computed with `hash` using the same key value.
+    ///
+    /// If there is no cached state with the given transitions, then None is
+    /// returned.
+    pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
+        let entry = &self.map[hash];
+        if entry.version != self.version {
+            return None;
+        }
+        // There may be a hash collision, so we need to confirm real equality.
+        if entry.key != key {
+            return None;
+        }
+        Some(entry.val)
+    }
+
+    /// Add a cached state to this map with the given key. Callers should
+    /// ensure that `state_id` points to a state that contains precisely the
+    /// NFA transitions given.
+    ///
+    /// `hash` must have been computed using the `hash` method with the same
+    /// key.
+    pub fn set(
+        &mut self,
+        key: Vec<Transition>,
+        hash: usize,
+        state_id: StateID,
+    ) {
+        self.map[hash] =
+            Utf8BoundedEntry { version: self.version, key, val: state_id };
+    }
+}
+
+/// A bounded cache of already-built suffix states, used to modestly shrink
+/// UTF-8 automata for large Unicode character classes.
+#[derive(Clone, Debug)]
+pub struct Utf8SuffixMap {
+    /// The current version of this map. Only entries with matching versions
+    /// are considered during lookups. If an entry is found with a mismatched
+    /// version, then the map behaves as if the entry does not exist.
+    version: u16,
+    /// The total number of entries this map can store.
+    capacity: usize,
+    /// The actual entries, indexed by hash. A collision between two different
+    /// states results in the old state being dropped.
+    map: Vec<Utf8SuffixEntry>,
+}
+
+/// A key that uniquely identifies a cached NFA state. It is a triple made of
+/// a state ID (`from`) and the bounds (`start`, `end`) of a byte range.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Utf8SuffixKey {
+    pub from: StateID,
+    pub start: u8,
+    pub end: u8,
+}
+
+/// A single slot of a `Utf8SuffixMap`.
+#[derive(Clone, Debug, Default)]
+struct Utf8SuffixEntry {
+    /// The version of the map used to produce this entry. If this entry's
+    /// version does not match the current version of the map, then the map
+    /// should behave as if this entry does not exist.
+    version: u16,
+    /// The key, which consists of a transition in a particular state.
+    key: Utf8SuffixKey,
+    /// The identifier that the transition in the key maps to.
+    val: StateID,
+}
+
+impl Utf8SuffixMap {
+    /// Create a new bounded map with the given capacity. The map will never
+    /// grow beyond the given size.
+    ///
+    /// Note that this does not allocate. Instead, callers must call `clear`
+    /// before using this map. `clear` will allocate space if necessary.
+    ///
+    /// This avoids the need to pay for the allocation of this map when
+    /// compiling regexes that lack large Unicode character classes.
+    pub fn new(capacity: usize) -> Utf8SuffixMap {
+        assert!(capacity > 0);
+        Utf8SuffixMap { version: 0, capacity, map: vec![] }
+    }
+
+    /// Clear this map of all entries, but permit the reuse of allocation
+    /// if possible.
+    ///
+    /// This must be called before the map can be used.
+    pub fn clear(&mut self) {
+        // Never-written slots carry version 0 and an all-zero key, so a
+        // usable map must not be at version 0: otherwise looking up that key
+        // would "hit" such a slot. Skip 0 on first use and on wrap-around.
+        self.version = self.version.wrapping_add(1);
+        if self.version == 0 || self.map.is_empty() {
+            self.version = 1;
+            self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+        }
+    }
+
+    /// Return the slot index for the given key (FNV-1a, reduced).
+    pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
+        // FNV-1a over the key's three fields. The constants are the
+        // module-level `INIT` and `PRIME` shared with `Utf8BoundedMap`,
+        // rather than a second local copy of the same values.
+        let mut h = INIT;
+        h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
+        h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
+        h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
+
+        // `clear` must have run first; with zero slots this would panic.
+        (h as usize) % self.map.len()
+    }
+
+    /// Retrieve the cached state ID corresponding to the given key. The hash
+    /// given must have been computed with `hash` using the same key value.
+    ///
+    /// If there is no cached state with the given key, then None is returned.
+    pub fn get(
+        &mut self,
+        key: &Utf8SuffixKey,
+        hash: usize,
+    ) -> Option<StateID> {
+        let entry = &self.map[hash];
+        if entry.version != self.version {
+            return None;
+        }
+        if key != &entry.key {
+            return None;
+        }
+        Some(entry.val)
+    }
+
+    /// Add a cached state to this map with the given key. Callers should
+    /// ensure that `state_id` points to a state that contains precisely the
+    /// NFA transition given.
+    ///
+    /// `hash` must have been computed using the `hash` method with the same
+    /// key.
+    pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
+        self.map[hash] =
+            Utf8SuffixEntry { version: self.version, key, val: state_id };
+    }
+}
diff --git a/src/nfa/mod.rs b/src/nfa/mod.rs
new file mode 100644
index 0000000..02d0501
--- /dev/null
+++ b/src/nfa/mod.rs
@@ -0,0 +1,252 @@
+use std::fmt;
+
+use classes::ByteClasses;
+pub use nfa::compiler::Builder;
+
+mod compiler;
+mod map;
+mod range_trie;
+
+/// The representation for an NFA state identifier.
+pub type StateID = usize;
+
+/// A final compiled NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+    /// Whether this NFA can only match at the beginning of input or not.
+    ///
+    /// When true, a match should only be reported if it begins at the 0th
+    /// index of the haystack.
+    anchored: bool,
+    /// The starting state of this NFA.
+    start: StateID,
+    /// The state list. This list is guaranteed to be indexable by the starting
+    /// state ID. It contains exactly one `Match` state, except for the NFA
+    /// built by `never_match`, which contains none.
+    states: Vec<State>,
+    /// A mapping from any byte value to its corresponding equivalence class
+    /// identifier. Two bytes in the same equivalence class cannot discriminate
+    /// between a match or a non-match. This map can be used to shrink the
+    /// total size of a DFA's transition table with a small match-time cost.
+    ///
+    /// Note that the NFA's transitions are *not* defined in terms of these
+    /// equivalence classes. The NFA's transitions are defined on the original
+    /// byte values. For the most part, this is because they wouldn't really
+    /// help the NFA much since the NFA already uses a sparse representation
+    /// to represent transitions. Byte classes are most effective in a dense
+    /// representation.
+    byte_classes: ByteClasses,
+}
+
+impl NFA {
+    /// Returns an NFA that always matches at every position.
+    pub fn always_match() -> NFA {
+        NFA {
+            anchored: false,
+            start: 0,
+            states: vec![State::Match],
+            byte_classes: ByteClasses::empty(),
+        }
+    }
+
+    /// Returns an NFA that never matches at any position.
+    pub fn never_match() -> NFA {
+        NFA {
+            anchored: false,
+            start: 0,
+            states: vec![State::Fail],
+            byte_classes: ByteClasses::empty(),
+        }
+    }
+
+    /// Returns true if and only if this NFA is anchored.
+    pub fn is_anchored(&self) -> bool {
+        self.anchored
+    }
+
+    /// Return the number of states in this NFA.
+    pub fn len(&self) -> usize {
+        self.states.len()
+    }
+
+    /// Return the ID of the initial state of this NFA.
+    pub fn start(&self) -> StateID {
+        self.start
+    }
+
+    /// Return the state with the given ID. Panics if `id` is out of bounds.
+    pub fn state(&self, id: StateID) -> &State {
+        &self.states[id]
+    }
+
+    /// Return the set of equivalence classes for this NFA. The `ByteClasses`
+    /// returned maps each of the 256 possible byte values to its
+    /// corresponding equivalence class ID (which is never more than 255).
+    pub fn byte_classes(&self) -> &ByteClasses {
+        &self.byte_classes
+    }
+}
+
+impl fmt::Debug for NFA {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for (i, state) in self.states.iter().enumerate() {
+            let status = if i == self.start { '>' } else { ' ' };
+            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+        }
+        Ok(())
+    }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+    /// A state that transitions to `next` if and only if the current input
+    /// byte is in the range `[start, end]` (inclusive).
+    ///
+    /// This is a special case of Sparse in that it encodes only one transition
+    /// (and therefore avoids the allocation).
+    Range { range: Transition },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range.
+    /// As such, this may only be used when every transition has equal
+    /// priority. (In practice, this is only used for encoding large UTF-8
+    /// automata.)
+    Sparse { ranges: Box<[Transition]> },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union { alternates: Box<[StateID]> },
+    /// A fail state. When encountered, the automaton is guaranteed to never
+    /// reach a match state.
+    Fail,
+    /// A match state. There is exactly one such state in an NFA, except for
+    /// the NFA returned by `NFA::never_match`, which has none.
+    Match,
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+    pub start: u8,
+    pub end: u8,
+    pub next: StateID,
+}
+
+impl State {
+    /// Returns true if and only if this state is a `Union`, which is the only
+    /// kind of state that carries epsilon transitions.
+    pub fn is_epsilon(&self) -> bool {
+        match *self {
+            State::Range { .. }
+            | State::Sparse { .. }
+            | State::Fail
+            | State::Match => false,
+            State::Union { .. } => true,
+        }
+    }
+
+    /// Remap the transitions in this state using the given map. Namely, the
+    /// given map should be indexed according to the transitions currently
+    /// in this state.
+    ///
+    /// This is used during the final phase of the NFA compiler, which turns
+    /// its intermediate NFA into the final NFA.
+    fn remap(&mut self, remap: &[StateID]) {
+        match *self {
+            State::Range { ref mut range } => range.next = remap[range.next],
+            State::Sparse { ref mut ranges } => {
+                for r in ranges.iter_mut() {
+                    r.next = remap[r.next];
+                }
+            }
+            State::Union { ref mut alternates } => {
+                for alt in alternates.iter_mut() {
+                    *alt = remap[*alt];
+                }
+            }
+            State::Fail => {}
+            State::Match => {}
+        }
+    }
+}
+
+impl fmt::Debug for State {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            State::Range { ref range } => range.fmt(f),
+            State::Sparse { ref ranges } => {
+                let rs = ranges
+                    .iter()
+                    .map(|t| format!("{:?}", t))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "sparse({})", rs)
+            }
+            State::Union { ref alternates } => {
+                let alts = alternates
+                    .iter()
+                    .map(|id| format!("{}", id))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "alt({})", alts)
+            }
+            State::Fail => write!(f, "FAIL"),
+            State::Match => write!(f, "MATCH"),
+        }
+    }
+}
+
+impl fmt::Debug for Transition {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let Transition { start, end, next } = *self;
+        if self.start == self.end {
+            write!(f, "{} => {}", escape(start), next)
+        } else {
+            write!(f, "{}-{} => {}", escape(start), escape(end), next)
+        }
+    }
+}
+
+/// Return the given byte as its escaped string form.
+fn escape(b: u8) -> String {
+    use std::ascii;
+
+    String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use dense;
+    use dfa::DFA;
+
+    #[test]
+    fn always_match() {
+        let nfa = NFA::always_match();
+        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+        assert_eq!(Some(0), dfa.find_at(b"", 0));
+        assert_eq!(Some(0), dfa.find_at(b"a", 0));
+        assert_eq!(Some(1), dfa.find_at(b"a", 1));
+        assert_eq!(Some(0), dfa.find_at(b"ab", 0));
+        assert_eq!(Some(1), dfa.find_at(b"ab", 1));
+        assert_eq!(Some(2), dfa.find_at(b"ab", 2));
+    }
+
+    #[test]
+    fn never_match() {
+        let nfa = NFA::never_match();
+        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
+
+        assert_eq!(None, dfa.find_at(b"", 0));
+        assert_eq!(None, dfa.find_at(b"a", 0));
+        assert_eq!(None, dfa.find_at(b"a", 1));
+        assert_eq!(None, dfa.find_at(b"ab", 0));
+        assert_eq!(None, dfa.find_at(b"ab", 1));
+        assert_eq!(None, dfa.find_at(b"ab", 2));
+    }
+}
diff --git a/src/nfa/range_trie.rs b/src/nfa/range_trie.rs
new file mode 100644
index 0000000..50767c7
--- /dev/null
+++ b/src/nfa/range_trie.rs
@@ -0,0 +1,1048 @@
+// I've called the primary data structure in this module a "range trie." As far
+// as I can tell, there is no prior art on a data structure like this, however,
+// it's likely someone somewhere has built something like it. Searching for
+// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+// but it does not appear relevant.
+//
+// The range trie is just like a trie in that it is a special case of a
+// deterministic finite state machine. It has states and each state has a set
+// of transitions to other states. It is acyclic, and, like a normal trie,
+// it makes no attempt to reuse common suffixes among its elements. The key
+// difference between a normal trie and a range trie below is that a range trie
+// operates on *contiguous sequences* of bytes instead of singleton bytes.
+// One could say that our alphabet is ranges of bytes instead of bytes
+// themselves, except a key part of range trie construction is splitting ranges
+// apart to ensure there is at most one transition that can be taken for any
+// byte in a given state.
+//
+// I've tried to explain the details of how the range trie works below, so
+// for now, we are left with trying to understand what problem we're trying to
+// solve. Which is itself fairly involved!
+//
+// At the highest level, here's what we want to do. We want to convert a
+// sequence of Unicode codepoints into a finite state machine whose transitions
+// are over *bytes* and *not* Unicode codepoints. We want this because it makes
+// said finite state machines much smaller and much faster to execute. As a
+// simple example, consider a byte oriented automaton for all Unicode scalar
+// values (0x00 through 0x10FFFF, not including surrogate codepoints):
+//
+//     [00-7F]
+//     [C2-DF][80-BF]
+//     [E0-E0][A0-BF][80-BF]
+//     [E1-EC][80-BF][80-BF]
+//     [ED-ED][80-9F][80-BF]
+//     [EE-EF][80-BF][80-BF]
+//     [F0-F0][90-BF][80-BF][80-BF]
+//     [F1-F3][80-BF][80-BF][80-BF]
+//     [F4-F4][80-8F][80-BF][80-BF]
+//
+// (These byte ranges are generated via the regex-syntax::utf8 module, which
+// was based on Russ Cox's code in RE2, which was in turn based on Ken
+// Thompson's implementation of the same idea in his Plan9 implementation of
+// grep.)
+//
+// It should be fairly straight-forward to see how one could compile this into
+// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+// build a trie from this fairly easily. The problem comes when your initial
+// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+// represented by '\w' contains only a tenth of the codepoints that
+// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+// as we did above, the list would stretch to 892 entries! This turns into
+// quite a large NFA with a few thousand states. Turning this beast into a DFA
+// takes quite a bit of time. We are thus left with trying to trim down the
+// number of states we produce as early as possible.
+//
+// One approach (used by RE2 and still by the regex crate, at time of writing)
+// is to try to find common suffixes while building NFA states for the above
+// and reuse them. This is very cheap to do and one can control precisely how
+// much extra memory you want to use for the cache.
+//
+// Another approach, however, is to reuse an algorithm for constructing a
+// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
+// the full details here, but I explain it in more depth in my blog post on
+// FSTs[1]. Note that the algorithm was not invented by me, but was published
+// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+// Minimal Acyclic Finite-State Automata."[2] Like the suffix cache approach
+// above, it is also possible to control the amount of extra memory one uses,
+// although this usually comes with the cost of sacrificing true minimality.
+// (But it's typically close enough with a reasonably sized cache of states.)
+//
+// The catch is that Daciuk's algorithm only works if you add your keys in
+// lexicographic ascending order. In our case, since we're dealing with ranges,
+// we also need the additional requirement that ranges are either equivalent
+// or do not overlap at all. For example, if one were given the following byte
+// ranges:
+//
+//     [BC-BF][80-BF]
+//     [BC-BF][90-BF]
+//
+// Then Daciuk's algorithm also would not work, since there is nothing to
+// handle the fact that the ranges overlap. They would need to be split apart.
+// Thankfully, Thompson's algorithm for producing byte ranges for Unicode
+// codepoint ranges meets both of our requirements.
+//
+// ... however, we would also like to be able to compile UTF-8 automata in
+// reverse. We want this because in order to find the starting location of a
+// match using a DFA, we need to run a second DFA---a reversed version of the
+// forward DFA---backwards to discover the match location. Unfortunately, if
+// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that
+// can overlap, even if they are sorted:
+//
+//     [00-7F]
+//     [80-BF][80-9F][ED-ED]
+//     [80-BF][80-BF][80-8F][F4-F4]
+//     [80-BF][80-BF][80-BF][F1-F3]
+//     [80-BF][80-BF][90-BF][F0-F0]
+//     [80-BF][80-BF][E1-EC]
+//     [80-BF][80-BF][EE-EF]
+//     [80-BF][A0-BF][E0-E0]
+//     [80-BF][C2-DF]
+//
+// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+// simple way to apply Daciuk's algorithm.
+//
+// And thus, the range trie was born. The range trie's only purpose is to take
+// sequences of byte ranges like the ones above, collect them into a trie and
+// then spit them out in a sorted fashion with no overlapping ranges. For
+// example, 0x00-0x10FFFF gets translated to:
+//
+//     [0-7F]
+//     [80-BF][80-9F][80-8F][F1-F3]
+//     [80-BF][80-9F][80-8F][F4]
+//     [80-BF][80-9F][90-BF][F0]
+//     [80-BF][80-9F][90-BF][F1-F3]
+//     [80-BF][80-9F][E1-EC]
+//     [80-BF][80-9F][ED]
+//     [80-BF][80-9F][EE-EF]
+//     [80-BF][A0-BF][80-8F][F1-F3]
+//     [80-BF][A0-BF][80-8F][F4]
+//     [80-BF][A0-BF][90-BF][F0]
+//     [80-BF][A0-BF][90-BF][F1-F3]
+//     [80-BF][A0-BF][E0]
+//     [80-BF][A0-BF][E1-EC]
+//     [80-BF][A0-BF][EE-EF]
+//     [80-BF][C2-DF]
+//
+// We've thus satisfied our requirements for running Daciuk's algorithm. All
+// sequences of ranges are sorted, and any corresponding ranges are either
+// exactly equivalent or non-overlapping.
+//
+// In effect, a range trie is building a DFA from a sequence of arbitrary
+// byte ranges. But it uses an algorithm custom tailored to its input, so it
+// is not as costly as traditional DFA construction. While it is still quite
+// a bit more costly than the forward case (which only needs Daciuk's
+// algorithm), it winds up saving a substantial amount of time if one is doing
+// a full DFA powerset construction later by virtue of producing a much much
+// smaller NFA.
+//
+// [1] - https://blog.burntsushi.net/transducers/
+// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+
+use std::cell::RefCell;
+use std::fmt;
+use std::mem;
+use std::ops::RangeInclusive;
+use std::u32;
+
+use regex_syntax::utf8::Utf8Range;
+
+/// A smaller state ID means more effective use of the CPU cache and less
+/// time spent copying. The implementation below will panic if the state ID
+/// space is exhausted, but in order for that to happen, the range trie itself
+/// would use well over 100GB of memory. Moreover, it's likely impossible
+/// for the state ID space to get that big. In fact, it's likely that even a
+/// u16 would be good enough here. But it's not quite clear how to prove this.
+type StateID = u32;
+
+/// There is only one final state in this trie. Every sequence of byte ranges
+/// added shares the same final state.
+const FINAL: StateID = 0;
+
+/// The root state of the trie.
+const ROOT: StateID = 1;
+
+/// A range trie represents an ordered set of sequences of bytes.
+///
+/// A range trie accepts as input a sequence of byte ranges and merges
+/// them into the existing set such that the trie can produce a sorted
+/// non-overlapping sequence of byte ranges. The sequence emitted corresponds
+/// precisely to the sequence of bytes matched by the given keys, although the
+/// byte ranges themselves may be split at different boundaries.
+///
+/// The order complexity of this data structure seems difficult to analyze.
+/// If the size of a byte is held as a constant, then insertion is clearly
+/// O(n) where n is the number of byte ranges in the input key. However, if
+/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In
+/// particular it seems possible for pathological inputs to cause insertion
+/// to do a lot of work. However, for what we use this data structure for,
+/// there should be no pathological inputs since the ultimate source is always
+/// a sorted set of Unicode scalar value ranges.
+///
+/// Internally, this trie is setup like a finite state machine. Note though
+/// that it is acyclic.
+#[derive(Clone)]
+pub struct RangeTrie {
+    /// The states in this trie. The first is always the shared final state.
+    /// The second is always the root state. Otherwise, there is no
+    /// particular order.
+    states: Vec<State>,
+    /// A free-list of states. When a range trie is cleared, all of its states
+    /// are added to this list. Creating a new state reuses a state from this
+    /// list before allocating a new one.
+    free: Vec<State>,
+    /// A stack for traversing this trie to yield sequences of byte ranges in
+    /// lexicographic order.
+    iter_stack: RefCell<Vec<NextIter>>,
+    /// A buffer that stores the current sequence during iteration.
+    iter_ranges: RefCell<Vec<Utf8Range>>,
+    /// A stack used for traversing the trie in order to (deeply) duplicate
+    /// a state.
+    dupe_stack: Vec<NextDupe>,
+    /// A stack used for traversing the trie during insertion of a new
+    /// sequence of byte ranges.
+    insert_stack: Vec<NextInsert>,
+}
+
+/// A single state in this trie.
+#[derive(Clone)]
+struct State {
+    /// A sorted sequence of non-overlapping transitions to other states. Each
+    /// transition corresponds to a single range of bytes.
+    transitions: Vec<Transition>,
+}
+
+/// A transition is a single range of bytes. If a particular byte is in this
+/// range, then the corresponding machine may transition to the state pointed
+/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+    /// The byte range.
+    range: Utf8Range,
+    /// The next state to transition to.
+    next_id: StateID,
+}
+
+impl RangeTrie {
+    /// Create a new empty range trie.
+    pub fn new() -> RangeTrie {
+        let mut trie = RangeTrie {
+            states: vec![],
+            free: vec![],
+            iter_stack: RefCell::new(vec![]),
+            iter_ranges: RefCell::new(vec![]),
+            dupe_stack: vec![],
+            insert_stack: vec![],
+        };
+        trie.clear();
+        trie
+    }
+
+    /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can be beneficial because this may reuse allocations.
+    pub fn clear(&mut self) {
+        self.free.extend(self.states.drain(..));
+        self.add_empty(); // final
+        self.add_empty(); // root
+    }
+
+    /// Iterate over all of the sequences of byte ranges in this trie, and
+    /// call the provided function for each sequence. Iteration occurs in
+    /// lexicographic order.
+    pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) {
+        let mut stack = self.iter_stack.borrow_mut();
+        stack.clear();
+        let mut ranges = self.iter_ranges.borrow_mut();
+        ranges.clear();
+
+        // We do iteration in a way that permits us to use a single buffer
+        // for our keys. We iterate in a depth first fashion, while being
+        // careful to expand our frontier as we move deeper in the trie.
+        stack.push(NextIter { state_id: ROOT, tidx: 0 });
+        while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+            // This could be implemented more simply without an inner loop
+            // here, but at the cost of more stack pushes.
+            loop {
+                let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+                // back to the parent state.
+                if tidx >= state.transitions.len() {
+                    ranges.pop();
+                    break;
+                }
+
+                let t = &state.transitions[tidx];
+                ranges.push(t.range);
+                if t.next_id == FINAL {
+                    f(&ranges);
+                    ranges.pop();
+                    tidx += 1;
+                } else {
+                    // Expand our frontier. Once we come back to this state
+                    // via the stack, start in on the next transition.
+                    stack.push(NextIter { state_id, tidx: tidx + 1 });
+                    // Otherwise, move to the first transition of the next
+                    // state.
+                    state_id = t.next_id;
+                    tidx = 0;
+                }
+            }
+        }
+    }
+
+    /// Inserts a new sequence of ranges into this trie.
+    ///
+    /// The sequence given must be non-empty and must not have a length
+    /// exceeding 4.
+    pub fn insert(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        assert!(ranges.len() <= 4);
+
+        let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+        stack.clear();
+
+        stack.push(NextInsert::new(ROOT, ranges));
+        while let Some(next) = stack.pop() {
+            let (state_id, ranges) = (next.state_id(), next.ranges());
+            assert!(!ranges.is_empty());
+
+            let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+            // i corresponds to the position of the existing transition on
+            // which we are operating. Typically, the result is to remove the
+            // transition and replace it with two or more new transitions
+            // corresponding to the partitions generated by splitting the
+            // 'new' with the ith transition's range.
+            let mut i = self.state(state_id).find(new);
+
+            // In this case, there is no overlap *and* the new range is greater
+            // than all existing ranges. So we can just add it to the end.
+            if i == self.state(state_id).transitions.len() {
+                let next_id = NextInsert::push(self, &mut stack, rest);
+                self.add_transition(state_id, new, next_id);
+                continue;
+            }
+
+            // The need for this loop is a bit subtle, but basically, after
+            // we've handled the partitions from our initial split, it's
+            // possible that there will be a partition leftover that overlaps
+            // with a subsequent transition. If so, then we have to repeat
+            // the split process again with the leftovers and that subsequent
+            // transition.
+            'OUTER: loop {
+                let old = self.state(state_id).transitions[i].clone();
+                let split = match Split::new(old.range, new) {
+                    Some(split) => split,
+                    None => {
+                        let next_id = NextInsert::push(self, &mut stack, rest);
+                        self.add_transition_at(i, state_id, new, next_id);
+                        continue; // restarts 'OUTER at the entry just added
+                    }
+                };
+                let splits = split.as_slice();
+                // If we only have one partition, then the ranges must be
+                // equivalent. There's nothing to do here for this state, so
+                // just move on to the next one.
+                if splits.len() == 1 {
+                    // ... but only if we have anything left to do.
+                    if !rest.is_empty() {
+                        stack.push(NextInsert::new(old.next_id, rest));
+                    }
+                    break;
+                }
+                // At this point, we know that 'split' is non-empty and there
+                // must be some overlap AND that the two ranges are not
+                // equivalent. Therefore, the existing range MUST be removed
+                // and split up somehow. Instead of actually doing the removal
+                // and then a subsequent insertion---with all the memory
+                // shuffling that entails---we simply overwrite the transition
+                // at position `i` for the first new transition we want to
+                // insert. After that, we're forced to do expensive inserts.
+                let mut first = true;
+                let mut add_trans =
+                    |trie: &mut RangeTrie, pos, from, range, to| {
+                        if first {
+                            trie.set_transition_at(pos, from, range, to);
+                            first = false;
+                        } else {
+                            trie.add_transition_at(pos, from, range, to);
+                        }
+                    };
+                for (j, &srange) in splits.iter().enumerate() {
+                    match srange {
+                        SplitRange::Old(r) => {
+                            // Deep clone the state pointed to by the ith
+                            // transition. This is always necessary since 'old'
+                            // is always coupled with at least a 'both'
+                            // partition. We don't want any new changes made
+                            // via the 'both' partition to impact the part of
+                            // the transition that doesn't overlap with the
+                            // new range.
+                            let dup_id = self.duplicate(old.next_id);
+                            add_trans(self, i, state_id, r, dup_id);
+                        }
+                        SplitRange::New(r) => {
+                            // This is a bit subtle, but if this happens to be
+                            // the last partition in our split, it is possible
+                            // that this overlaps with a subsequent transition.
+                            // If it does, then we must repeat the whole
+                            // splitting process over again with `r` and the
+                            // subsequent transition.
+                            {
+                                let trans = &self.state(state_id).transitions;
+                                if j + 1 == splits.len()
+                                    && i < trans.len()
+                                    && intersects(r, trans[i].range)
+                                {
+                                    new = r;
+                                    continue 'OUTER;
+                                }
+                            }
+
+                            // ... otherwise, setup exploration for a new
+                            // empty state and add a brand new transition for
+                            // this new range.
+                            let next_id =
+                                NextInsert::push(self, &mut stack, rest);
+                            add_trans(self, i, state_id, r, next_id);
+                        }
+                        SplitRange::Both(r) => {
+                            // Continue adding the remaining ranges on this
+                            // path and update the transition with the new
+                            // range.
+                            if !rest.is_empty() {
+                                stack.push(NextInsert::new(old.next_id, rest));
+                            }
+                            add_trans(self, i, state_id, r, old.next_id);
+                        }
+                    }
+                    i += 1;
+                }
+                // If we've reached this point, then we know that there are
+                // no subsequent transitions with any overlap. Therefore, we
+                // can stop processing this range and move on to the next one.
+                break;
+            }
+        }
+        self.insert_stack = stack;
+    }
+
+    pub fn add_empty(&mut self) -> StateID {
+        if self.states.len() as u64 > u32::MAX as u64 {
+            // This generally should not happen since a range trie is only
+            // ever used to compile a single sequence of Unicode scalar values.
+            // If we ever got to this point, we would, at *minimum*, be using
+            // 96GB in just the range trie alone.
+            panic!("too many sequences added to range trie");
+        }
+        let id = self.states.len() as StateID;
+        // If we have some free states available, then use them to avoid
+        // more allocations.
+        if let Some(mut state) = self.free.pop() {
+            state.clear();
+            self.states.push(state);
+        } else {
+            self.states.push(State { transitions: vec![] });
+        }
+        id
+    }
+
+    /// Performs a deep clone of the given state and returns the duplicate's
+    /// state ID.
+    ///
+    /// A "deep clone" in this context means that the state given along with
+    /// recursively all states that it points to are copied. Once complete,
+    /// the given state ID and the returned state ID share nothing.
+    ///
+    /// This is useful during range trie insertion when a new range overlaps
+    /// with an existing range that is bigger than the new one. The part of
+    /// the existing range that does *not* overlap with the new one is then
+    /// duplicated so that adding the new range to the overlap doesn't disturb
+    /// the non-overlapping portion.
+    ///
+    /// There's one exception: if old_id is the final state, then it is not
+    /// duplicated and the same final state is returned. This is because all
+    /// final states in this trie are equivalent.
+    fn duplicate(&mut self, old_id: StateID) -> StateID {
+        if old_id == FINAL {
+            return FINAL;
+        }
+
+        let mut stack = mem::replace(&mut self.dupe_stack, vec![]);
+        stack.clear();
+
+        let new_id = self.add_empty();
+        // old_id is the state we're cloning and new_id is the ID of the
+        // duplicated state for old_id.
+        stack.push(NextDupe { old_id, new_id });
+        while let Some(NextDupe { old_id, new_id }) = stack.pop() {
+            for i in 0..self.state(old_id).transitions.len() {
+                let t = self.state(old_id).transitions[i].clone();
+                if t.next_id == FINAL {
+                    // All final states are the same, so there's no need to
+                    // duplicate it.
+                    self.add_transition(new_id, t.range, FINAL);
+                    continue;
+                }
+
+                let new_child_id = self.add_empty();
+                self.add_transition(new_id, t.range, new_child_id);
+                stack.push(NextDupe {
+                    old_id: t.next_id,
+                    new_id: new_child_id,
+                });
+            }
+        }
+        self.dupe_stack = stack;
+        new_id
+    }
+
+    /// Appends a transition on `range` to `next_id` to the state `from_id`.
+    ///
+    /// No sorting happens here. Callers must ensure that every transition
+    /// already in this state is lexicographically smaller than the given
+    /// range.
+    fn add_transition(
+        &mut self,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        let transition = Transition { range, next_id };
+        self.state_mut(from_id).transitions.push(transition);
+    }
+
+    /// Like `add_transition`, but instead of appending, the new transition
+    /// is inserted immediately before the transition at index `i`. Every
+    /// transition from index `i` onward shifts one position to the right.
+    fn add_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        let transition = Transition { range, next_id };
+        self.state_mut(from_id).transitions.insert(i, transition);
+    }
+
+    /// Overwrites the ith transition of the given state (i must be in bounds).
+    fn set_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id).transitions[i] = Transition { range, next_id };
+    }
+
+    /// Return an immutable borrow for the state with the given ID.
+    fn state(&self, id: StateID) -> &State {
+        &self.states[id as usize] // panics if `id` is out of bounds
+    }
+
+    /// Return a mutable borrow for the state with the given ID.
+    fn state_mut(&mut self, id: StateID) -> &mut State {
+        &mut self.states[id as usize] // panics if `id` is out of bounds
+    }
+}
+
+impl State {
+    /// Returns the index at which the given range belongs among this state's
+    /// transitions.
+    ///
+    /// The index returned is always in the inclusive range
+    /// [0, transitions.len()]. An index of 'transitions.len()' means that the
+    /// given range overlaps with no other range in this state *and* is greater
+    /// than all of them.
+    ///
+    /// For every other index, the given range either overlaps with the
+    /// transition at that index, or is strictly less than it with no overlap
+    /// (while being greater than the transition before it). The overlapping
+    /// case demands careful handling when the range is added as a new
+    /// transition. In the non-overlapping case, the range can be inserted as
+    /// a new transition at the returned index without disrupting any other
+    /// transitions.
+    fn find(&self, range: Utf8Range) -> usize {
+        /// Returns the smallest index `i` for which `pred(&xs[i])` is true,
+        /// assuming `pred` is false for a (possibly empty) prefix of `xs`
+        /// and true for everything after it. If `pred` is never true, then
+        /// `xs.len()` is returned.
+        ///
+        /// This is hand-rolled since it isn't obvious how to make the
+        /// standard library's binary search do this: when several ranges
+        /// overlap, the first of them is the one that is wanted.
+        fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+        where
+            F: FnMut(&T) -> bool,
+        {
+            let mut lo = 0;
+            let mut hi = xs.len();
+            while lo < hi {
+                let mid = lo + (hi - lo) / 2;
+                if pred(&xs[mid]) {
+                    hi = mid;
+                } else {
+                    lo = mid + 1;
+                }
+            }
+            lo
+        }
+
+        // Benchmarks suggest that binary search is just a bit faster than
+        // a straight linear scan. Specifically when using the debug tool:
+        //
+        //   hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+        let ends_at_or_after = |t: &Transition| range.start <= t.range.end;
+        binary_search(&self.transitions, ends_at_or_after)
+    }
+
+    /// Removes every transition from this state.
+    fn clear(&mut self) {
+        self.transitions.clear();
+    }
+}
+
+/// The next state to process during duplication (see `duplicate`).
+#[derive(Clone, Debug)]
+struct NextDupe {
+    /// The existing state whose transitions are to be copied.
+    old_id: StateID,
+    /// The ID of the new state that is a duplicate of old_id.
+    new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+    state_id: StateID, // the state to visit
+    tidx: usize, // presumably an index into that state's transitions; confirm
+}
+
+/// The next state to process during insertion and any remaining ranges that we
+/// want to add for a particular sequence of ranges. The first such instance
+/// is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+    /// The next state to begin inserting ranges. This state should be the
+    /// state at which `ranges[0]` should be inserted.
+    state_id: StateID,
+    /// The ranges to insert. We use a fixed-size array here to avoid an
+    /// allocation.
+    ranges: [Utf8Range; 4],
+    /// The number of valid ranges at the front of the above array.
+    len: u8,
+}
+
+impl NextInsert {
+    /// Creates a work item for inserting `ranges` beginning at the state
+    /// `state_id`, which should be the state at which the first of the given
+    /// ranges is to be inserted. The slice given must contain at least one
+    /// and at most four ranges.
+    fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+        let len = ranges.len();
+        assert!(len > 0);
+        assert!(len <= 4);
+
+        let mut buf = [Utf8Range { start: 0, end: 0 }; 4];
+        for (slot, &range) in buf.iter_mut().zip(ranges) {
+            *slot = range;
+        }
+        NextInsert { state_id, ranges: buf, len: len as u8 }
+    }
+
+    /// Schedules `ranges` for insertion at a new empty state, which is
+    /// added to the trie and pushed onto `stack`, and returns that state's
+    /// ID. If ranges is empty, no state is created and FINAL is returned.
+    fn push(
+        trie: &mut RangeTrie,
+        stack: &mut Vec<NextInsert>,
+        ranges: &[Utf8Range],
+    ) -> StateID {
+        if ranges.is_empty() {
+            return FINAL;
+        }
+        let next_id = trie.add_empty();
+        stack.push(NextInsert::new(next_id, ranges));
+        next_id
+    }
+
+    /// The ID of the state at which insertion should resume.
+    fn state_id(&self) -> StateID {
+        self.state_id
+    }
+
+    /// The ranges still waiting to be inserted.
+    fn ranges(&self) -> &[Utf8Range] {
+        &self.ranges[..self.len as usize]
+    }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges. This
+/// is the secret sauce that makes a range trie work, as it's what tells us
+/// how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with anything
+/// else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap. i.e., The intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// remaining parts of the overlap can then be added to using the new ranges
+/// without disturbing the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
+/// the following distinct relationships where at least one must apply. The order
+/// of these matters, since multiple can match. The first to match applies.
+///
+///   1. b < x <=> [a,b] < [x,y]
+///   2. y < a <=> [x,y] < [a,b]
+///
+/// In the case of (1) and (2), these are the only cases where there is no
+/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
+/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
+/// intersection in all of the following cases is non-empty.
+///
+///    3. a = x && b = y <=> [a,b] == [x,y]
+///    4. a = x && b < y <=> [x,y] right-extends [a,b]
+///    5. b = y && a > x <=> [x,y] left-extends [a,b]
+///    6. x = a && y < b <=> [a,b] right-extends [x,y]
+///    7. y = b && x > a <=> [a,b] left-extends [x,y]
+///    8. a > x && b < y <=> [x,y] covers [a,b]
+///    9. x > a && y < b <=> [a,b] covers [x,y]
+///   10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+///   11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+///   12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+///   13. y > a && y < b <=> [x,y] left-overlaps [a,b]
+///
+/// In cases 3-13, we can form rules that partition the ranges into a
+/// non-overlapping ordered sequence of ranges:
+///
+///    3. [a,b]
+///    4. [a,b], [b+1,y]
+///    5. [x,a-1], [a,b]
+///    6. [x,y], [y+1,b]
+///    7. [a,x-1], [x,y]
+///    8. [x,a-1], [a,b], [b+1,y]
+///    9. [a,x-1], [x,y], [y+1,b]
+///   10. [a,b-1], [b,b], [b+1,y]
+///   11. [x,y-1], [y,y], [y+1,b]
+///   12. [a,x-1], [x,b], [b+1,y]
+///   13. [x,a-1], [a,y], [y+1,b]
+///
+/// In the code below, we go a step further and identify each of the above
+/// outputs as belonging either to the overlap of the two ranges or to one
+/// of [a,b] or [x,y] exclusively.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct Split {
+    partitions: [SplitRange; 3], // only the first `len` entries are meaningful
+    len: usize, // number of partitions in use: 1, 2 or 3
+}
+
+/// A tagged range indicating how it was derived from a pair of ranges.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum SplitRange {
+    Old(Utf8Range), // a piece found only in the old (existing) range
+    New(Utf8Range), // a piece found only in the new range
+    Both(Utf8Range), // the piece shared by both ranges (the intersection)
+}
+
+impl Split {
+    /// Filler for the unused trailing slots of `partitions`. Its value
+    /// doesn't matter since it is never accessed.
+    const NADA: SplitRange = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+
+    /// Computes the partitioning of the old range `o` and the new range
+    /// `n`, tagging each piece by the range(s) it came from.
+    ///
+    /// If the given ranges have an empty intersection, then None is returned.
+    fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
+        let old = |start, end| SplitRange::Old(Utf8Range { start, end });
+        let new = |start, end| SplitRange::New(Utf8Range { start, end });
+        let both = |start, end| SplitRange::Both(Utf8Range { start, end });
+
+        // Use same names as the comment above to make it easier to compare.
+        let (a, b) = (o.start, o.end);
+        let (x, y) = (n.start, n.end);
+
+        if b < x || y < a {
+            // case 1, case 2
+            return None;
+        }
+        let split = if a == x && b == y {
+            // case 3
+            Split::parts1(both(a, b))
+        } else if a == x && b < y {
+            // case 4
+            Split::parts2(both(a, b), new(b + 1, y))
+        } else if b == y && a > x {
+            // case 5
+            Split::parts2(new(x, a - 1), both(a, b))
+        } else if x == a && y < b {
+            // case 6
+            Split::parts2(both(x, y), old(y + 1, b))
+        } else if y == b && x > a {
+            // case 7
+            Split::parts2(old(a, x - 1), both(x, y))
+        } else if a > x && b < y {
+            // case 8
+            Split::parts3(new(x, a - 1), both(a, b), new(b + 1, y))
+        } else if x > a && y < b {
+            // case 9
+            Split::parts3(old(a, x - 1), both(x, y), old(y + 1, b))
+        } else if b == x && a < y {
+            // case 10
+            Split::parts3(old(a, b - 1), both(b, b), new(b + 1, y))
+        } else if y == a && x < b {
+            // case 11
+            Split::parts3(new(x, y - 1), both(y, y), old(y + 1, b))
+        } else if b > x && b < y {
+            // case 12
+            Split::parts3(old(a, x - 1), both(x, b), new(b + 1, y))
+        } else if y > a && y < b {
+            // case 13
+            Split::parts3(new(x, a - 1), both(a, y), old(y + 1, b))
+        } else {
+            unreachable!()
+        };
+        Some(split)
+    }
+
+    /// Builds a split holding exactly one partition, which is what two
+    /// equivalent ranges produce.
+    fn parts1(r1: SplitRange) -> Split {
+        Split { partitions: [r1, Split::NADA, Split::NADA], len: 1 }
+    }
+
+    /// Builds a split holding two partitions.
+    fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
+        Split { partitions: [r1, r2, Split::NADA], len: 2 }
+    }
+
+    /// Builds a split holding three partitions.
+    fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
+        Split { partitions: [r1, r2, r3], len: 3 }
+    }
+
+    /// Borrows the partitions that are in use as a slice.
+    fn as_slice(&self) -> &[SplitRange] {
+        &self.partitions[..self.len]
+    }
+}
+
+impl fmt::Debug for RangeTrie {
+    /// Emits a newline, then one line per state, flagging FINAL with a '*'.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        writeln!(f)?;
+        self.states.iter().enumerate().try_for_each(|(id, state)| {
+            let marker = if id == FINAL as usize { '*' } else { ' ' };
+            writeln!(f, "{}{:06}: {:?}", marker, id, state)
+        })
+    }
+}
+
+impl fmt::Debug for State {
+    /// Writes the transitions separated by ", ", streaming each directly to
+    /// the formatter rather than building intermediate strings.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for (i, t) in self.transitions.iter().enumerate() {
+            let sep = if i == 0 { "" } else { ", " };
+            write!(f, "{}{:?}", sep, t)?;
+        }
+        Ok(())
+    }
+}
+
+impl fmt::Debug for Transition {
+    /// Formats as `START-END => NEXT` in uppercase hex, collapsing the range
+    /// to just `START` when its start and end are the same byte.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let (start, end) = (self.range.start, self.range.end);
+        let next = self.next_id;
+        if start != end {
+            write!(f, "{:02X}-{:02X} => {:02X}", start, end, next)
+        } else {
+            write!(f, "{:02X} => {:02X}", start, next)
+        }
+    }
+}
+
+/// Reports whether the two given ranges have at least one byte in common.
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
+    r2.start <= r1.end && r1.start <= r2.end
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ops::RangeInclusive;
+
+    use regex_syntax::utf8::Utf8Range;
+
+    use super::*;
+
+    fn r(range: RangeInclusive<u8>) -> Utf8Range {
+        Utf8Range { start: *range.start(), end: *range.end() }
+    }
+
+    fn split_maybe(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Option<Split> {
+        Split::new(r(old), r(new))
+    }
+
+    fn split(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Vec<SplitRange> {
+        split_maybe(old, new).unwrap().as_slice().to_vec()
+    }
+
+    #[test]
+    fn no_splits() {
+        // case 1: the old range lies entirely below the new range
+        assert_eq!(None, split_maybe(0..=1, 2..=3));
+        // case 2: the old range lies entirely above the new range
+        assert_eq!(None, split_maybe(2..=3, 0..=1));
+    }
+
+    #[test]
+    fn splits() {
+        let range = |r: RangeInclusive<u8>| Utf8Range {
+            start: *r.start(),
+            end: *r.end(),
+        };
+        let old = |r| SplitRange::Old(range(r));
+        let new = |r| SplitRange::New(range(r));
+        let both = |r| SplitRange::Both(range(r));
+
+        // case 3: the ranges are identical
+        assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
+        assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);
+
+        // case 4: the new range right-extends the old range
+        assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
+        assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
+        assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);
+
+        // case 5: the new range left-extends the old range
+        assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
+        assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
+        assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);
+
+        // case 6: the old range right-extends the new range
+        assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
+        assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
+        assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);
+
+        // case 7: the old range left-extends the new range
+        assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
+        assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
+        assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);
+
+        // case 8: the new range covers the old range
+        assert_eq!(
+            split(3..=6, 2..=7),
+            vec![new(2..=2), both(3..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 1..=8),
+            vec![new(1..=2), both(3..=6), new(7..=8)],
+        );
+
+        // case 9: the old range covers the new range
+        assert_eq!(
+            split(2..=7, 3..=6),
+            vec![old(2..=2), both(3..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(1..=8, 3..=6),
+            vec![old(1..=2), both(3..=6), old(7..=8)],
+        );
+
+        // case 10: the old range is left-adjacent to the new (one shared byte)
+        assert_eq!(
+            split(3..=6, 6..=7),
+            vec![old(3..=5), both(6..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 6..=8),
+            vec![old(3..=5), both(6..=6), new(7..=8)],
+        );
+        assert_eq!(
+            split(5..=6, 6..=7),
+            vec![old(5..=5), both(6..=6), new(7..=7)],
+        );
+
+        // case 11: the new range is left-adjacent to the old (one shared byte)
+        assert_eq!(
+            split(6..=7, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(6..=8, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=8)],
+        );
+        assert_eq!(
+            split(6..=7, 5..=6),
+            vec![new(5..=5), both(6..=6), old(7..=7)],
+        );
+
+        // case 12: the old range left-overlaps the new range
+        assert_eq!(
+            split(3..=7, 5..=9),
+            vec![old(3..=4), both(5..=7), new(8..=9)],
+        );
+        assert_eq!(
+            split(3..=5, 4..=6),
+            vec![old(3..=3), both(4..=5), new(6..=6)],
+        );
+
+        // case 13: the new range left-overlaps the old range
+        assert_eq!(
+            split(5..=9, 3..=7),
+            vec![new(3..=4), both(5..=7), old(8..=9)],
+        );
+        assert_eq!(
+            split(4..=6, 3..=5),
+            vec![new(3..=3), both(4..=5), old(6..=6)],
+        );
+    }
+
+    // Arguably there should be more tests here, but in practice, this data
+    // structure is well covered by the huge number of regex tests.
+}
diff --git a/src/regex.rs b/src/regex.rs
new file mode 100644
index 0000000..47e1c58
--- /dev/null
+++ b/src/regex.rs
@@ -0,0 +1,771 @@
+#[cfg(feature = "std")]
+use dense::{self, DenseDFA};
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::Result;
+#[cfg(feature = "std")]
+use sparse::SparseDFA;
+#[cfg(feature = "std")]
+use state_id::StateID;
+
+/// A regular expression that uses deterministic finite automata for fast
+/// searching.
+///
+/// A regular expression is comprised of two DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
+/// match while the reverse DFA is responsible for detecting the start of a
+/// match. Thus, in order to find the bounds of any given match, a forward
+/// search must first be run followed by a reverse search. A match found by
+/// the forward DFA guarantees that the reverse DFA will also find a match.
+///
+/// The type of the DFA used by a `Regex` corresponds to the `D` type
+/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
+/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
+/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
+/// search faster, while sparse DFAs use less memory but search more slowly.
+///
+/// By default, a regex's DFA type parameter is set to
+/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
+/// most convenient type that gives the best search performance.
+///
+/// # Sparse DFAs
+///
+/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
+/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
+/// enough to build corresponding sparse DFAs, and then build a regex from
+/// them:
+///
+/// ```
+/// use regex_automata::Regex;
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// // First, build a regex that uses dense DFAs.
+/// let dense_re = Regex::new("foo[0-9]+")?;
+///
+/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+/// let fwd = dense_re.forward().to_sparse()?;
+/// let rev = dense_re.reverse().to_sparse()?;
+///
+/// // Third, build a new regex from the constituent sparse DFAs.
+/// let sparse_re = Regex::from_dfas(fwd, rev);
+///
+/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+/// assert_eq!(true, sparse_re.is_match(b"foo123"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
+    forward: D, // the forward DFA: detects the end of a match
+    reverse: D, // the reverse DFA: detects the start of a match
+}
+
+/// A regular expression that uses deterministic finite automata for fast
+/// searching.
+///
+/// A regular expression is comprised of two DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
+/// match while the reverse DFA is responsible for detecting the start of a
+/// match. Thus, in order to find the bounds of any given match, a forward
+/// search must first be run followed by a reverse search. A match found by
+/// the forward DFA guarantees that the reverse DFA will also find a match.
+///
+/// The type of the DFA used by a `Regex` corresponds to the `D` type
+/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
+/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
+/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
+/// search faster, while sparse DFAs use less memory but search more slowly.
+///
+/// When using this crate without the standard library, the `Regex` type has
+/// no default type parameter.
+///
+/// # Sparse DFAs
+///
+/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
+/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
+/// enough to build corresponding sparse DFAs, and then build a regex from
+/// them:
+///
+/// ```
+/// use regex_automata::Regex;
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// // First, build a regex that uses dense DFAs.
+/// let dense_re = Regex::new("foo[0-9]+")?;
+///
+/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+/// let fwd = dense_re.forward().to_sparse()?;
+/// let rev = dense_re.reverse().to_sparse()?;
+///
+/// // Third, build a new regex from the constituent sparse DFAs.
+/// let sparse_re = Regex::from_dfas(fwd, rev);
+///
+/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+/// assert_eq!(true, sparse_re.is_match(b"foo123"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+#[cfg(not(feature = "std"))]
+#[derive(Clone, Debug)]
+pub struct Regex<D> {
+    forward: D, // the forward DFA: detects the end of a match
+    reverse: D, // the reverse DFA: detects the start of a match
+}
+
+#[cfg(feature = "std")]
+impl Regex {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding regex.
+    ///
+    /// The default configuration uses `usize` for state IDs, premultiplies
+    /// them and reduces the alphabet size by splitting bytes into equivalence
+    /// classes. The underlying DFAs are *not* minimized.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`RegexBuilder`](struct.RegexBuilder.html)
+    /// to set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn new(pattern: &str) -> Result<Regex> {
+        RegexBuilder::new().build(pattern) // a builder with no options changed
+    }
+}
+
+#[cfg(feature = "std")]
+impl Regex<SparseDFA<Vec<u8>, usize>> {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding regex using sparse DFAs.
+    ///
+    /// The default configuration uses `usize` for state IDs and reduces the
+    /// alphabet size by splitting bytes into equivalence classes. The
+    /// underlying DFAs are *not* minimized.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`RegexBuilder`](struct.RegexBuilder.html)
+    /// to set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn new_sparse(
+        pattern: &str,
+    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
+        RegexBuilder::new().build_sparse(pattern) // untouched builder options
+    }
+}
+
+impl<D: DFA> Regex<D> {
+    /// Returns true if and only if the given bytes match.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// assert_eq!(true, re.is_match(b"foo12345bar"));
+    /// assert_eq!(false, re.is_match(b"foobar"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn is_match(&self, input: &[u8]) -> bool {
+        self.is_match_at(input, 0) // offset 0: search from the input's start
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
+    ///
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let re = Regex::new("abc|a")?;
+    /// assert_eq!(Some(1), re.shortest_match(b"abc"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
+        self.shortest_match_at(input, 0) // offset 0: start of the input
+    }
+
+    /// Returns the start and end offset of the leftmost first match. If no
+    /// match exists, then `None` is returned.
+    ///
+    /// The "leftmost first" match corresponds to the match with the smallest
+    /// starting offset, but where the end offset is determined by preferring
+    /// earlier branches in the original regular expression. For example,
+    /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
+    /// match `Samwise` in `Samwise`.
+    ///
+    /// Generally speaking, the "leftmost first" match is how most backtracking
+    /// regular expressions tend to work. This is in contrast to POSIX-style
+    /// regular expressions that yield "leftmost longest" matches. Namely,
+    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+    /// leftmost longest semantics.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over latter parts.
+    /// let re = Regex::new("abc|a")?;
+    /// assert_eq!(Some((0, 3)), re.find(b"abc"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
+        self.find_at(input, 0) // offset 0: search from the input's start
+    }
+
+    /// Returns the same as `is_match`, but starts the search at the given
+    /// offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, if the DFA is anchored, then
+    /// a match can only occur when `start == 0`.
+    pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
+        self.forward().is_match_at(input, start) // forward DFA only
+    }
+
+    /// Returns the same as `shortest_match`, but starts the search at the
+    /// given offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, if the DFA is anchored, then
+    /// a match can only occur when `start == 0`.
+    pub fn shortest_match_at(
+        &self,
+        input: &[u8],
+        start: usize,
+    ) -> Option<usize> {
+        self.forward().shortest_match_at(input, start) // forward DFA only
+    }
+
+    /// Returns the same as `find`, but starts the search at the given
+    /// offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, if the DFA is anchored, then
+    /// a match can only occur when `start == 0`.
+    pub fn find_at(
+        &self,
+        input: &[u8],
+        start: usize,
+    ) -> Option<(usize, usize)> {
+        // The forward DFA reports where the match ends; without an end
+        // there is no match at all.
+        let end = self.forward().find_at(input, start)?;
+        // The reverse DFA then locates the start. It is only given
+        // `input[start..end]`, so its result is relative to `start`.
+        let offset = self
+            .reverse()
+            .rfind(&input[start..end])
+            .expect("reverse search must match if forward search does");
+        Some((start + offset, end))
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost first matches
+    /// in the given bytes. If no match exists, then the iterator yields no
+    /// elements.
+    ///
+    /// Note that if the regex can match the empty string, then it is
+    /// possible for the iterator to yield a zero-width match at a location
+    /// that is not a valid UTF-8 boundary (for example, between the code units
+    /// of a UTF-8 encoded codepoint). This can happen regardless of whether
+    /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
+    /// was enabled or not.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let text = b"foo1 foo12 foo123";
+    /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+    /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
+        Matches::new(self, input)
+    }
+
+    /// Build a new regex from its constituent forward and reverse DFAs.
+    ///
+    /// This is useful when deserializing a regex from some arbitrary
+    /// memory region. This is also useful for building regexes from other
+    /// types of DFAs.
+    ///
+    /// # Example
+    ///
+    /// This example is a bit contrived. The usual use of these methods
+    /// would involve serializing `initial_re` somewhere and then deserializing
+    /// it later to build a regex.
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let initial_re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(true, initial_re.is_match(b"foo123"));
+    ///
+    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+    /// let re = Regex::from_dfas(fwd, rev);
+    /// assert_eq!(true, re.is_match(b"foo123"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    ///
+    /// This example shows how you might build smaller DFAs, and then use those
+    /// smaller DFAs to build a new regex.
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let initial_re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(true, initial_re.is_match(b"foo123"));
+    ///
+    /// let fwd = initial_re.forward().to_u16()?;
+    /// let rev = initial_re.reverse().to_u16()?;
+    /// let re = Regex::from_dfas(fwd, rev);
+    /// assert_eq!(true, re.is_match(b"foo123"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    ///
+    /// This example shows how to build a `Regex` that uses sparse DFAs instead
+    /// of dense DFAs:
+    ///
+    /// ```
+    /// use regex_automata::Regex;
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let initial_re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(true, initial_re.is_match(b"foo123"));
+    ///
+    /// let fwd = initial_re.forward().to_sparse()?;
+    /// let rev = initial_re.reverse().to_sparse()?;
+    /// let re = Regex::from_dfas(fwd, rev);
+    /// assert_eq!(true, re.is_match(b"foo123"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
+        Regex { forward, reverse }
+    }
+
+    /// Return the underlying DFA responsible for forward matching.
+    pub fn forward(&self) -> &D {
+        &self.forward
+    }
+
+    /// Return the underlying DFA responsible for reverse matching.
+    pub fn reverse(&self) -> &D {
+        &self.reverse
+    }
+}
+
+/// An iterator over all non-overlapping matches for a particular search.
+///
+/// The iterator yields a `(usize, usize)` value until no more matches could be
+/// found. The first `usize` is the start of the match (inclusive) while the
+/// second `usize` is the end of the match (exclusive).
+///
+/// `D` is the type of the DFAs used by the underlying regex. The lifetime
+/// variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression value itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct Matches<'r, 't, D: DFA + 'r> {
+    re: &'r Regex<D>,
+    text: &'t [u8],
+    last_end: usize,
+    last_match: Option<usize>,
+}
+
+impl<'r, 't, D: DFA> Matches<'r, 't, D> {
+    fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
+        Matches { re, text, last_end: 0, last_match: None }
+    }
+}
+
+impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<(usize, usize)> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let (s, e) = match self.re.find_at(self.text, self.last_end) {
+            None => return None,
+            Some((s, e)) => (s, e),
+        };
+        if s == e {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = e + 1;
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(e) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = e;
+        }
+        self.last_match = Some(e);
+        Some((s, e))
+    }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring several aspects of the construction
+/// process such as case insensitivity, Unicode support and various options
+/// that impact the size of the underlying DFAs. In some cases, options (like
+/// performing DFA minimization) can come with a substantial additional cost.
+///
+/// This builder generally constructs two DFAs, where one is responsible for
+/// finding the end of a match and the other is responsible for finding the
+/// start of a match. If you only need to detect whether something matched,
+/// or only the end of a match, then you should use a
+/// [`dense::Builder`](dense/struct.Builder.html)
+/// to construct a single DFA, which is cheaper than building two DFAs.
+#[cfg(feature = "std")]
+#[derive(Clone, Debug)]
+pub struct RegexBuilder {
+    dfa: dense::Builder,
+}
+
+#[cfg(feature = "std")]
+impl RegexBuilder {
+    /// Create a new regex builder with the default configuration.
+    pub fn new() -> RegexBuilder {
+        RegexBuilder { dfa: dense::Builder::new() }
+    }
+
+    /// Build a regex from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<Regex> {
+        self.build_with_size::<usize>(pattern)
+    }
+
+    /// Build a regex from the given pattern using sparse DFAs.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build_sparse(
+        &self,
+        pattern: &str,
+    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
+        self.build_with_size_sparse::<usize>(pattern)
+    }
+
+    /// Build a regex from the given pattern using a specific representation
+    /// for the underlying DFA state IDs.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    ///
+    /// The representation of state IDs is determined by the `S` type
+    /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
+    /// or `usize`, where `usize` is the default used for `build`. The purpose
+    /// of specifying a representation for state IDs is to reduce the memory
+    /// footprint of the underlying DFAs.
+    ///
+    /// When using this routine, the chosen state ID representation will be
+    /// used throughout determinization and minimization, if minimization was
+    /// requested. Even if the minimized DFAs can fit into the chosen state ID
+    /// representation but the initial determinized DFA cannot, then this will
+    /// still return an error. To get a minimized DFA with a smaller state ID
+    /// representation, first build it with a bigger state ID representation,
+    /// and then shrink the sizes of the DFAs using one of its conversion
+    /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
+    /// Finally, reconstitute the regex via
+    /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas).
+    pub fn build_with_size<S: StateID>(
+        &self,
+        pattern: &str,
+    ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
+        let forward = self.dfa.build_with_size(pattern)?;
+        let reverse = self
+            .dfa
+            .clone()
+            .anchored(true)
+            .reverse(true)
+            .longest_match(true)
+            .build_with_size(pattern)?;
+        Ok(Regex::from_dfas(forward, reverse))
+    }
+
+    /// Build a regex from the given pattern using a specific representation
+    /// for the underlying DFA state IDs using sparse DFAs.
+    pub fn build_with_size_sparse<S: StateID>(
+        &self,
+        pattern: &str,
+    ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
+        let re = self.build_with_size(pattern)?;
+        let fwd = re.forward().to_sparse()?;
+        let rev = re.reverse().to_sparse()?;
+        Ok(Regex::from_dfas(fwd, rev))
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of the input. When
+    /// disabled, the regex will act as if the pattern started with a `.*?`,
+    /// which enables a match to appear anywhere.
+    ///
+    /// By default this is disabled.
+    pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.anchored(yes);
+        self
+    }
+
+    /// Enable or disable the case insensitive flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `i` flag.
+    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.case_insensitive(yes);
+        self
+    }
+
+    /// Enable verbose mode in the regular expression.
+    ///
+    /// When enabled, verbose mode permits insignificant whitespace in many
+    /// places in the regular expression, as well as comments. Comments are
+    /// started using `#` and continue until the end of the line.
+    ///
+    /// By default, this is disabled. It may be selectively enabled in the
+    /// regular expression by using the `x` flag regardless of this setting.
+    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.ignore_whitespace(yes);
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `s` flag.
+    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.dot_matches_new_line(yes);
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `U` flag.
+    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.swap_greed(yes);
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    ///
+    /// By default this is **enabled**. It may alternatively be selectively
+    /// disabled in the regular expression itself via the `u` flag.
+    ///
+    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
+    /// default), a regular expression will fail to parse if Unicode mode is
+    /// disabled and a sub-expression could possibly match invalid UTF-8.
+    pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.unicode(yes);
+        self
+    }
+
+    /// When enabled, the builder will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// When disabled (the default), the builder is guaranteed to produce a
+    /// regex that will only ever match valid UTF-8 (otherwise, the builder
+    /// will return an error).
+    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.allow_invalid_utf8(yes);
+        self
+    }
+
+    /// Set the nesting limit used for the regular expression parser.
+    ///
+    /// The nesting limit controls how deep the abstract syntax tree is allowed
+    /// to be. If the AST exceeds the given limit (e.g., with too many nested
+    /// groups), then an error is returned by the parser.
+    ///
+    /// The purpose of this limit is to act as a heuristic to prevent stack
+    /// overflow when building a finite automaton from a regular expression's
+    /// abstract syntax tree. In particular, construction currently uses
+    /// recursion. In the future, the implementation may stop using recursion
+    /// and this option will no longer be necessary.
+    ///
+    /// This limit is not checked until the entire AST is parsed. Therefore,
+    /// if callers want to put a limit on the amount of heap space used, then
+    /// they should impose a limit on the length, in bytes, of the concrete
+    /// pattern string. In particular, this is viable since the parser will
+    /// limit itself to heap space proportional to the length of the pattern
+    /// string.
+    ///
+    /// Note that a nest limit of `0` will return a nest limit error for most
+    /// patterns but not all. For example, a nest limit of `0` permits `a` but
+    /// not `ab`, since `ab` requires a concatenation AST item, which results
+    /// in a nest depth of `1`. In general, a nest limit is not something that
+    /// manifests in an obvious way in the concrete syntax, therefore, it
+    /// should not be used in a granular way.
+    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
+        self.dfa.nest_limit(limit);
+        self
+    }
+
+    /// Minimize the underlying DFAs.
+    ///
+    /// When enabled, the DFAs powering the resulting regex will be minimized
+    /// such that they are as small as possible.
+    ///
+    /// Whether one enables minimization or not depends on the types of costs
+    /// you're willing to pay and how much you care about its benefits. In
+    /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+    /// space, where `n` is the number of DFA states and `k` is the alphabet
+    /// size. In practice, minimization can be quite costly in terms of both
+    /// space and time, so it should only be done if you're willing to wait
+    /// longer to produce a DFA. In general, you might want a minimal DFA in
+    /// the following circumstances:
+    ///
+    /// 1. You would like to optimize for the size of the automaton. This can
+    ///    manifest in one of two ways. Firstly, if you're converting the
+    ///    DFA into Rust code (or a table embedded in the code), then a minimal
+    ///    DFA will translate into a corresponding reduction in code size, and
+    ///    thus, also the final compiled binary size. Secondly, if you are
+    ///    building many DFAs and putting them on the heap, you'll be able to
+    ///    fit more if they are smaller. Note though that building a minimal
+    ///    DFA itself requires additional space; you only realize the space
+    ///    savings once the minimal DFA is constructed (at which point, the
+    ///    space used for minimization is freed).
+    /// 2. You've observed that a smaller DFA results in faster match
+    ///    performance. Naively, this isn't guaranteed since there is no
+    ///    inherent difference between matching with a bigger-than-minimal
+    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
+    ///    CPU's cache more efficiently.
+    /// 3. You are trying to establish an equivalence between regular
+    ///    languages. The standard method for this is to build a minimal DFA
+    ///    for each language and then compare them. If the DFAs are equivalent
+    ///    (up to state renaming), then the languages are equivalent.
+    ///
+    /// This option is disabled by default.
+    pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.minimize(yes);
+        self
+    }
+
+    /// Premultiply state identifiers in the underlying DFA transition tables.
+    ///
+    /// When enabled, state identifiers are premultiplied to point to their
+    /// corresponding row in the DFA's transition table. That is, given the
+    /// `i`th state, its corresponding premultiplied identifier is `i * k`
+    /// where `k` is the alphabet size of the DFA. (The alphabet size is at
+    /// most 256, but is in practice smaller if byte classes are enabled.)
+    ///
+    /// When state identifiers are not premultiplied, then the identifier of
+    /// the `i`th state is `i`.
+    ///
+    /// The advantage of premultiplying state identifiers is that it saves
+    /// a multiplication instruction per byte when searching with the DFA.
+    /// This has been observed to lead to a 20% performance benefit in
+    /// micro-benchmarks.
+    ///
+    /// The primary disadvantage of premultiplying state identifiers is
+    /// that they require a larger integer size to represent. For example,
+    /// if your DFA has 200 states, then its premultiplied form requires
+    /// 16 bits to represent every possible state identifier, whereas its
+    /// non-premultiplied form only requires 8 bits.
+    ///
+    /// This option is enabled by default.
+    pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.premultiply(yes);
+        self
+    }
+
+    /// Shrink the size of the underlying DFA alphabet by mapping bytes to
+    /// their equivalence classes.
+    ///
+    /// When enabled, each DFA will use a map from all possible bytes to their
+    /// corresponding equivalence class. Each equivalence class represents a
+    /// set of bytes that does not discriminate between a match and a non-match
+    /// in the DFA. For example, the pattern `[ab]+` has at least two
+    /// equivalence classes: a set containing `a` and `b` and a set containing
+    /// every byte except for `a` and `b`. `a` and `b` are in the same
+    /// equivalence class because they never discriminate between a match
+    /// and a non-match.
+    ///
+    /// The advantage of this map is that the size of the transition table can
+    /// be reduced drastically from `#states * 256 * sizeof(id)` to
+    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
+    /// classes. As a result, total space usage can decrease substantially.
+    /// Moreover, since a smaller alphabet is used, compilation becomes faster
+    /// as well.
+    ///
+    /// The disadvantage of this map is that every byte searched must be
+    /// passed through this map before it can be used to determine the next
+    /// transition. This has a small match time performance cost.
+    ///
+    /// This option is enabled by default.
+    pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.byte_classes(yes);
+        self
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// This may be exposed in the future, but for now is exported for use in
+    /// the `regex-automata-debug` tool.
+    #[doc(hidden)]
+    pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
+        self.dfa.shrink(yes);
+        self
+    }
+}
+
+#[cfg(feature = "std")]
+impl Default for RegexBuilder {
+    fn default() -> RegexBuilder {
+        RegexBuilder::new()
+    }
+}
diff --git a/src/sparse.rs b/src/sparse.rs
new file mode 100644
index 0000000..d18024b
--- /dev/null
+++ b/src/sparse.rs
@@ -0,0 +1,1256 @@
+#[cfg(feature = "std")]
+use core::fmt;
+#[cfg(feature = "std")]
+use core::iter;
+use core::marker::PhantomData;
+use core::mem::size_of;
+#[cfg(feature = "std")]
+use std::collections::HashMap;
+
+#[cfg(feature = "std")]
+use byteorder::{BigEndian, LittleEndian};
+use byteorder::{ByteOrder, NativeEndian};
+
+use classes::ByteClasses;
+use dense;
+use dfa::DFA;
+#[cfg(feature = "std")]
+use error::{Error, Result};
+#[cfg(feature = "std")]
+use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
+#[cfg(not(feature = "std"))]
+use state_id::{dead_id, StateID};
+
+/// A sparse table-based deterministic finite automaton (DFA).
+///
+/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
+/// more space efficient representation for its transition table. Consequently,
+/// sparse DFAs can use much less memory than dense DFAs, but this comes at a
+/// price. In particular, reading the more space efficient transitions takes
+/// more work, and consequently, searching using a sparse DFA is typically
+/// slower than a dense DFA.
+///
+/// A sparse DFA can be built using the default configuration via the
+/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
+/// one can configure various aspects of a dense DFA via
+/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
+/// DFA to a sparse DFA using
+/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
+///
+/// In general, a sparse DFA supports all the same operations as a dense DFA.
+///
+/// Making the choice between a dense and sparse DFA depends on your specific
+/// work load. If you can sacrifice a bit of search time performance, then a
+/// sparse DFA might be the best choice. In particular, while sparse DFAs are
+/// probably always slower than dense DFAs, you may find that they are easily
+/// fast enough for your purposes!
+///
+/// # State size
+///
+/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
+/// the type of the DFA's transition table while `S` corresponds to the
+/// representation used for the DFA's state identifiers as described by the
+/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
+/// `usize`, but other valid choices provided by this crate include `u8`,
+/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
+/// identifier representation than the default is to reduce the amount of
+/// memory used by a DFA. Note though, that if the chosen representation cannot
+/// accommodate the size of your DFA, then building the DFA will fail and
+/// return an error.
+///
+/// While the reduction in heap memory used by a DFA is one reason for choosing
+/// a smaller state identifier representation, another possible reason is for
+/// decreasing the serialization size of a DFA, as returned by
+/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
+/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
+/// or
+/// [`to_bytes_native_endian`](enum.SparseDFA.html#method.to_bytes_native_endian).
+///
+/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
+/// depending on where the transition table is stored. Note that this is
+/// different than a dense DFA, whose transition table is typically
+/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
+/// its transition table from raw bytes because the table is compactly packed.
+///
+/// # Variants
+///
+/// This DFA is defined as a non-exhaustive enumeration of different types of
+/// sparse DFAs. All of the variants use the same internal representation
+/// for the transition table, but they vary in how the transition table is
+/// read. A DFA's specific variant depends on the configuration options set via
+/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
+/// `ByteClass`.
+///
+/// # The `DFA` trait
+///
+/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
+/// can be used for searching. For example:
+///
+/// ```
+/// use regex_automata::{DFA, SparseDFA};
+///
+/// # fn example() -> Result<(), regex_automata::Error> {
+/// let dfa = SparseDFA::new("foo[0-9]+")?;
+/// assert_eq!(Some(8), dfa.find(b"foo12345"));
+/// # Ok(()) }; example().unwrap()
+/// ```
+///
+/// The `DFA` trait also provides an assortment of other lower level methods
+/// for DFAs, such as `start_state` and `next_state`. While these are correctly
+/// implemented, it is an anti-pattern to use them in performance sensitive
+/// code on the `SparseDFA` type directly. Namely, each implementation requires
+/// a branch to determine which type of sparse DFA is being used. Instead,
+/// this branch should be pushed up a layer in the code since walking the
+/// transitions of a DFA is usually a hot path. If you do need to use these
+/// lower level methods in performance critical code, then you should match on
+/// the variants of this DFA and use each variant's implementation of the `DFA`
+/// trait directly.
+#[derive(Clone, Debug)]
+pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
+    /// A standard DFA that does not use byte classes.
+    Standard(Standard<T, S>),
+    /// A DFA that shrinks its alphabet to a set of equivalence classes instead
+    /// of using all possible byte values. Any two bytes belong to the same
+    /// equivalence class if and only if they can be used interchangeably
+    /// anywhere in the DFA while never discriminating between a match and a
+    /// non-match.
+    ///
+    /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
+    /// from using byte classes. In some cases, using byte classes can even
+    /// marginally increase the size of a sparse DFA's transition table. The
+    /// reason for this is that a sparse DFA already compacts each state's
+    /// transitions separate from whether byte classes are used.
+    ByteClass(ByteClass<T, S>),
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+#[cfg(feature = "std")]
+impl SparseDFA<Vec<u8>, usize> {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding sparse DFA.
+    ///
+    /// The default configuration uses `usize` for state IDs and reduces the
+    /// alphabet size by splitting bytes into equivalence classes. The
+    /// resulting DFA is *not* minimized.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`dense::Builder`](dense/struct.Builder.html)
+    /// to set your own configuration, and then call
+    /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
+    /// to create a sparse DFA.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{DFA, SparseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa = SparseDFA::new("foo[0-9]+bar")?;
+    /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
+        dense::Builder::new()
+            .build(pattern)
+            .and_then(|dense| dense.to_sparse())
+    }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> SparseDFA<Vec<u8>, S> {
+    /// Create a new empty sparse DFA that never matches any input.
+    ///
+    /// # Example
+    ///
+    /// In order to build an empty DFA, callers must provide a type hint
+    /// indicating their choice of state identifier representation.
+    ///
+    /// ```
+    /// use regex_automata::{DFA, SparseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
+    /// assert_eq!(None, dfa.find(b""));
+    /// assert_eq!(None, dfa.find(b"foo"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn empty() -> SparseDFA<Vec<u8>, S> {
+        dense::DenseDFA::empty().to_sparse().unwrap()
+    }
+
+    pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
+        dfa: &dense::Repr<T, S>,
+    ) -> Result<SparseDFA<Vec<u8>, A>> {
+        Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
+    }
+}
+
+impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
+    /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
+    /// DFA returned always uses `&[u8]` for its transition table while keeping
+    /// the same state identifier representation.
+    pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
+        match *self {
+            SparseDFA::Standard(Standard(ref r)) => {
+                SparseDFA::Standard(Standard(r.as_ref()))
+            }
+            SparseDFA::ByteClass(ByteClass(ref r)) => {
+                SparseDFA::ByteClass(ByteClass(r.as_ref()))
+            }
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    /// Return an owned version of this sparse DFA. Specifically, the DFA
+    /// returned always uses `Vec<u8>` for its transition table while keeping
+    /// the same state identifier representation.
+    ///
+    /// Effectively, this returns a sparse DFA whose transition table lives
+    /// on the heap.
+    #[cfg(feature = "std")]
+    pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
+        match *self {
+            SparseDFA::Standard(Standard(ref r)) => {
+                SparseDFA::Standard(Standard(r.to_owned()))
+            }
+            SparseDFA::ByteClass(ByteClass(ref r)) => {
+                SparseDFA::ByteClass(ByteClass(r.to_owned()))
+            }
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    /// Returns the memory usage, in bytes, of this DFA.
+    ///
+    /// The memory usage is computed based on the number of bytes used to
+    /// represent this DFA's transition table. This typically corresponds to
+    /// heap memory usage.
+    ///
+    /// This does **not** include the stack size used up by this DFA. To
+    /// compute that, use `std::mem::size_of::<SparseDFA>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.repr().memory_usage()
+    }
+
+    fn repr(&self) -> &Repr<T, S> {
+        match *self {
+            SparseDFA::Standard(ref r) => &r.0,
+            SparseDFA::ByteClass(ref r) => &r.0,
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+/// Conversions from a sparse DFA into other forms: the same automaton with a
+/// different state identifier width, or raw bytes for persistent storage.
+#[cfg(feature = "std")]
+impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
+    /// Build a sparse DFA with the same match semantics as this one, using
+    /// `u8` as the state identifier representation. An error is returned if
+    /// `u8` cannot represent every state identifier in this DFA.
+    ///
+    /// Shorthand for `to_sized::<u8>()`.
+    pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
+        self.to_sized::<u8>()
+    }
+
+    /// Build a sparse DFA with the same match semantics as this one, using
+    /// `u16` as the state identifier representation. An error is returned if
+    /// `u16` cannot represent every state identifier in this DFA.
+    ///
+    /// Shorthand for `to_sized::<u16>()`.
+    pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
+        self.to_sized::<u16>()
+    }
+
+    /// Build a sparse DFA with the same match semantics as this one, using
+    /// `u32` as the state identifier representation. An error is returned if
+    /// `u32` cannot represent every state identifier in this DFA.
+    ///
+    /// Shorthand for `to_sized::<u32>()`. Only available on targets with
+    /// 32-bit or 64-bit pointers.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
+        self.to_sized::<u32>()
+    }
+
+    /// Build a sparse DFA with the same match semantics as this one, using
+    /// `u64` as the state identifier representation. An error is returned if
+    /// `u64` cannot represent every state identifier in this DFA.
+    ///
+    /// Shorthand for `to_sized::<u64>()`. Only available on targets with
+    /// 64-bit pointers.
+    #[cfg(target_pointer_width = "64")]
+    pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
+        self.to_sized::<u64>()
+    }
+
+    /// Build a sparse DFA with the same match semantics as this one, using
+    /// `A` as the state identifier representation. An error is returned if
+    /// `A` cannot represent every state identifier in this DFA.
+    ///
+    /// Another way to obtain such a DFA is
+    /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
+    /// Choosing the right size when the sparse DFA is first constructed is
+    /// generally preferable, because the conversion then happens in one step
+    /// rather than two.
+    pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
+        let resized = self.repr().to_sized::<A>()?;
+        Ok(resized.into_sparse_dfa())
+    }
+
+    /// Serialize this sparse DFA to raw bytes using little endian encoding.
+    ///
+    /// An error is returned when the size of the state identifier
+    /// representation is not 1, 2, 4 or 8 bytes. Every `StateID`
+    /// implementation provided by this crate meets that requirement.
+    pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
+        let repr = self.repr();
+        repr.to_bytes::<LittleEndian>()
+    }
+
+    /// Serialize this sparse DFA to raw bytes using big endian encoding.
+    ///
+    /// An error is returned when the size of the state identifier
+    /// representation is not 1, 2, 4 or 8 bytes. Every `StateID`
+    /// implementation provided by this crate meets that requirement.
+    pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
+        let repr = self.repr();
+        repr.to_bytes::<BigEndian>()
+    }
+
+    /// Serialize this sparse DFA to raw bytes using the native endianness.
+    /// Picking an explicit endianness with `to_bytes_little_endian` or
+    /// `to_bytes_big_endian` is usually the better choice; this routine is
+    /// handy in tests that serialize and deserialize on the same platform.
+    ///
+    /// An error is returned when the size of the state identifier
+    /// representation is not 1, 2, 4 or 8 bytes. Every `StateID`
+    /// implementation provided by this crate meets that requirement.
+    pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
+        let repr = self.repr();
+        repr.to_bytes::<NativeEndian>()
+    }
+}
+
+impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
+    /// Deserialize a sparse DFA with a specific state identifier
+    /// representation.
+    ///
+    /// Deserializing a DFA using this routine will never allocate heap memory.
+    /// This is also guaranteed to be a constant time operation that does not
+    /// vary with the size of the DFA.
+    ///
+    /// The bytes given should be generated by the serialization of a DFA with
+    /// either the
+    /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian)
+    /// method or the
+    /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
+    /// method, depending on the endianness of the machine you are
+    /// deserializing this DFA from.
+    ///
+    /// If the state identifier representation is `usize`, then deserialization
+    /// is dependent on the pointer size. For this reason, it is best to
+    /// serialize DFAs using a fixed size representation for your state
+    /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
+    ///
+    /// # Panics
+    ///
+    /// The bytes given should be *trusted*. In particular, if the bytes
+    /// are not a valid serialization of a DFA, or if the endianness of the
+    /// serialized bytes is different than the endianness of the machine that
+    /// is deserializing the DFA, then this routine will panic. Moreover, it
+    /// is possible for this deserialization routine to succeed even if the
+    /// given bytes do not represent a valid serialized sparse DFA.
+    ///
+    /// # Safety
+    ///
+    /// This routine is unsafe because it permits callers to provide an
+    /// arbitrary transition table with possibly incorrect transitions. While
+    /// the various serialization routines will never return an incorrect
+    /// transition table, there is no guarantee that the bytes provided here
+    /// are correct. While deserialization does many checks (as documented
+    /// above in the panic conditions), this routine does not check that the
+    /// transition table is correct. Given an incorrect transition table, it is
+    /// possible for the search routines to access out-of-bounds memory because
+    /// of explicit bounds check elision.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize a DFA to raw bytes, deserialize it
+    /// and then use it for searching. Note that we first convert the DFA to
+    /// using `u16` for its state identifier representation before serializing
+    /// it. While this isn't strictly necessary, it's good practice in order to
+    /// decrease the size of the DFA and to avoid platform specific pitfalls
+    /// such as differing pointer sizes.
+    ///
+    /// ```
+    /// use regex_automata::{DFA, DenseDFA, SparseDFA};
+    ///
+    /// # fn example() -> Result<(), regex_automata::Error> {
+    /// let sparse = SparseDFA::new("foo[0-9]+")?;
+    /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
+    ///
+    /// let dfa: SparseDFA<&[u8], u16> = unsafe {
+    ///     SparseDFA::from_bytes(&bytes)
+    /// };
+    ///
+    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
+        // Parse the header into the shared representation, then wrap it in
+        // the variant selected by its byte classes.
+        Repr::from_bytes(buf).into_sparse_dfa()
+    }
+}
+
+/// The `DFA` implementation for the sparse DFA enum. State queries are
+/// answered by the shared representation; transition and search routines
+/// dispatch to the concrete variant.
+impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
+    type ID = S;
+
+    #[inline]
+    fn start_state(&self) -> S {
+        self.repr().start_state()
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        self.repr().is_match_state(id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        self.repr().is_dead_state(id)
+    }
+
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        self.repr().is_match_or_dead_state(id)
+    }
+
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        self.repr().is_anchored()
+    }
+
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        // The variant decides whether the input byte is first mapped through
+        // the byte classes, so dispatch on it for every transition.
+        match *self {
+            SparseDFA::Standard(ref r) => r.next_state(current, input),
+            SparseDFA::ByteClass(ref r) => r.next_state(current, input),
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        // This simply forwards to the checked `next_state`.
+        self.next_state(current, input)
+    }
+
+    // We specialize the following methods because it lets us lift the
+    // case analysis between the different types of sparse DFAs. Instead of
+    // doing the case analysis for every transition, we do it once before
+    // searching. For sparse DFAs, this doesn't seem to benefit performance as
+    // much as it does for the dense DFAs, but it's easy to do so we might as
+    // well do it.
+
+    #[inline]
+    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
+        match *self {
+            SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
+            SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match *self {
+            SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
+            SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match *self {
+            SparseDFA::Standard(ref r) => r.find_at(bytes, start),
+            SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+
+    #[inline]
+    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
+        match *self {
+            SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
+            SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
+            SparseDFA::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+/// A standard sparse DFA that does not use premultiplication or byte classes.
+///
+/// Generally, it isn't necessary to use this type directly, since a
+/// `SparseDFA` can be used for searching directly. One possible reason why
+/// one might want to use this type directly is if you are implementing your
+/// own search routines by walking a DFA's transitions directly. In that case,
+/// you'll want to use this type (or any of the other DFA variant types)
+/// directly, since they implement `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
+
+/// Every method delegates to the wrapped representation; transitions are
+/// looked up with the raw input byte.
+impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
+    type ID = S;
+
+    #[inline]
+    fn start_state(&self) -> S {
+        self.0.start_state()
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        self.0.is_match_state(id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        self.0.is_dead_state(id)
+    }
+
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        self.0.is_match_or_dead_state(id)
+    }
+
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        self.0.is_anchored()
+    }
+
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        // The input byte is used as-is to search the state's transitions.
+        self.0.state(current).next(input)
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        // This simply forwards to the checked `next_state`.
+        self.next_state(current, input)
+    }
+}
+
+/// A sparse DFA that shrinks its alphabet.
+///
+/// Alphabet shrinking is achieved by using a set of equivalence classes
+/// instead of using all possible byte values. Any two bytes belong to the same
+/// equivalence class if and only if they can be used interchangeably anywhere
+/// in the DFA while never discriminating between a match and a non-match.
+///
+/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
+/// using byte classes. In some cases, using byte classes can even marginally
+/// increase the size of a sparse DFA's transition table. The reason for this
+/// is that a sparse DFA already compacts each state's transitions separate
+/// from whether byte classes are used.
+///
+/// Generally, it isn't necessary to use this type directly, since a
+/// `SparseDFA` can be used for searching directly. One possible reason why
+/// one might want to use this type directly is if you are implementing your
+/// own search routines by walking a DFA's transitions directly. In that case,
+/// you'll want to use this type (or any of the other DFA variant types)
+/// directly, since they implement `next_state` more efficiently.
+#[derive(Clone, Debug)]
+pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
+
+/// Every method delegates to the wrapped representation; transitions are
+/// looked up with the equivalence class of the input byte.
+impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
+    type ID = S;
+
+    #[inline]
+    fn start_state(&self) -> S {
+        self.0.start_state()
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: S) -> bool {
+        self.0.is_match_state(id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: S) -> bool {
+        self.0.is_dead_state(id)
+    }
+
+    #[inline]
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        self.0.is_match_or_dead_state(id)
+    }
+
+    #[inline]
+    fn is_anchored(&self) -> bool {
+        self.0.is_anchored()
+    }
+
+    #[inline]
+    fn next_state(&self, current: S, input: u8) -> S {
+        // Map the input byte to its equivalence class first; the state's
+        // transitions are then searched using the class, not the raw byte.
+        let input = self.0.byte_classes.get(input);
+        self.0.state(current).next(input)
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
+        // This simply forwards to the checked `next_state`.
+        self.next_state(current, input)
+    }
+}
+
+/// The underlying representation of a sparse DFA. This is shared by all of
+/// the different variants of a sparse DFA.
+///
+/// `Debug` is only derived when the `std` feature is disabled; with `std`
+/// enabled a hand written implementation is used instead.
+#[derive(Clone)]
+#[cfg_attr(not(feature = "std"), derive(Debug))]
+struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
+    /// Whether this DFA is anchored.
+    anchored: bool,
+    /// The identifier of the start state.
+    start: S,
+    /// The number of states in this DFA.
+    state_count: usize,
+    /// Any identifier less than or equal to this one is treated as either a
+    /// match state or the dead state.
+    max_match: S,
+    /// The mapping from input bytes to equivalence classes.
+    byte_classes: ByteClasses,
+    /// The encoded transition table. A state identifier is used as a byte
+    /// offset into this table. Each state is laid out as a native endian
+    /// `u16` transition count `n`, followed by `2 * n` bytes of input ranges,
+    /// followed by `n * size_of::<S>()` bytes of next state identifiers.
+    trans: T,
+}
+
+impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
+    /// Wrap this representation in the matching `SparseDFA` variant:
+    /// `Standard` when the byte classes are singletons, `ByteClass`
+    /// otherwise.
+    fn into_sparse_dfa(self) -> SparseDFA<T, S> {
+        if !self.byte_classes.is_singleton() {
+            return SparseDFA::ByteClass(ByteClass(self));
+        }
+        SparseDFA::Standard(Standard(self))
+    }
+
+    /// Produce a representation with the same metadata as this one whose
+    /// transition table is borrowed from `self` instead of owned.
+    fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
+        let trans: &'a [u8] = self.trans();
+        Repr {
+            trans,
+            byte_classes: self.byte_classes.clone(),
+            max_match: self.max_match,
+            state_count: self.state_count,
+            start: self.start,
+            anchored: self.anchored,
+        }
+    }
+
+    /// Produce a representation with the same metadata as this one whose
+    /// transition table is a freshly allocated copy of this one's bytes.
+    #[cfg(feature = "std")]
+    fn to_owned(&self) -> Repr<Vec<u8>, S> {
+        let trans: Vec<u8> = self.trans().to_vec();
+        Repr {
+            trans,
+            byte_classes: self.byte_classes.clone(),
+            max_match: self.max_match,
+            state_count: self.state_count,
+            start: self.start,
+            anchored: self.anchored,
+        }
+    }
+
+    /// Decode the state stored at byte offset `id` of the transition table
+    /// into a borrowed view.
+    ///
+    /// The `#[inline]` hint is deliberate: without it this did not seem to
+    /// get inlined, which caused a fairly significant performance loss
+    /// (~25%).
+    #[inline]
+    fn state<'a>(&'a self, id: S) -> State<'a, S> {
+        let table = self.trans();
+        let header_at = id.to_usize();
+        // The state begins with its transition count as a native endian u16.
+        let ntrans = NativeEndian::read_u16(&table[header_at..]) as usize;
+        // Two bytes (inclusive start and end) per transition follow the count.
+        let ranges_at = header_at + 2;
+        let next_at = ranges_at + (ntrans * 2);
+        let input_ranges = &table[ranges_at..next_at];
+        // Then one encoded state identifier per transition.
+        let next = &table[next_at..next_at + (ntrans * size_of::<S>())];
+        State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
+    }
+
+    /// Return an iterator over all of the states in this DFA.
+    ///
+    /// The iterator returned yields tuples, where the first element is the
+    /// state ID and the second element is the state itself.
+    ///
+    /// Iteration starts at the identifier produced by `dead_id()`.
+    #[cfg(feature = "std")]
+    fn states<'a>(&'a self) -> StateIter<'a, T, S> {
+        StateIter { dfa: self, id: dead_id() }
+    }
+
+    /// Returns the length, in bytes, of the transition table.
+    fn memory_usage(&self) -> usize {
+        self.trans().len()
+    }
+
+    /// Returns the identifier of the start state.
+    fn start_state(&self) -> S {
+        self.start
+    }
+
+    /// Returns true if `id` is at most `max_match` and is not the dead state.
+    fn is_match_state(&self, id: S) -> bool {
+        self.is_match_or_dead_state(id) && !self.is_dead_state(id)
+    }
+
+    /// Returns true if `id` is the identifier produced by `dead_id()`.
+    fn is_dead_state(&self, id: S) -> bool {
+        id == dead_id()
+    }
+
+    /// Returns true if `id` is less than or equal to `max_match`.
+    fn is_match_or_dead_state(&self, id: S) -> bool {
+        id <= self.max_match
+    }
+
+    /// Returns whether this DFA is anchored.
+    fn is_anchored(&self) -> bool {
+        self.anchored
+    }
+
+    /// Returns the encoded transition table as a byte slice.
+    fn trans(&self) -> &[u8] {
+        self.trans.as_ref()
+    }
+
+    /// Create a new sparse DFA whose match semantics are equivalent to this
+    /// DFA, but attempt to use `A` for the representation of state
+    /// identifiers. If `A` is insufficient to represent all state identifiers
+    /// in this DFA, then this returns an error.
+    ///
+    /// The error can only arise in the first pass, where each new byte offset
+    /// is converted to an `A`. The second pass only copies identifiers that
+    /// are already of type `A` out of the map, so it cannot fail.
+    #[cfg(feature = "std")]
+    fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
+        // To build the new DFA, we proceed much like the initial construction
+        // of the sparse DFA. Namely, since the state ID size is changing,
+        // we don't actually know all of our state IDs until we've allocated
+        // all necessary space. So we do one pass that allocates all of the
+        // storage we need, and then another pass to fill in the transitions.
+
+        // The capacity is only an initial hint; the table grows as needed.
+        let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
+        let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
+        for (old_id, state) in self.states() {
+            // The new identifier of a state is its byte offset in the new
+            // table. This is where an error is reported if `A` is too small.
+            let pos = trans.len();
+            map.insert(old_id, usize_to_state_id(pos)?);
+
+            // Reserve the whole state: the count, the input ranges and
+            // zero-filled space for the (resized) next state identifiers.
+            let n = state.ntrans;
+            let zeros = 2 + (n * 2) + (n * size_of::<A>());
+            trans.extend(iter::repeat(0).take(zeros));
+
+            // The count and the input ranges do not depend on the state ID
+            // size, so they can be copied over right away.
+            NativeEndian::write_u16(&mut trans[pos..], n as u16);
+            let (s, e) = (pos + 2, pos + 2 + (n * 2));
+            trans[s..e].copy_from_slice(state.input_ranges);
+        }
+
+        let mut new = Repr {
+            anchored: self.anchored,
+            start: map[&self.start],
+            state_count: self.state_count,
+            max_match: map[&self.max_match],
+            byte_classes: self.byte_classes.clone(),
+            trans,
+        };
+        // Second pass: every old identifier now has a new one, so the
+        // transitions can be rewritten. The map already holds values of type
+        // `A`, so they are written directly without any further conversion.
+        for (&old_id, &new_id) in map.iter() {
+            let old_state = self.state(old_id);
+            let mut new_state = new.state_mut(new_id);
+            for i in 0..new_state.ntrans {
+                new_state.set_next_at(i, map[&old_state.next_at(i)]);
+            }
+        }
+        Ok(new)
+    }
+
+    /// Serialize a sparse DFA to raw bytes using the provided endianness.
+    ///
+    /// If the state identifier representation of this DFA has a size different
+    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
+    /// implementations of `StateID` provided by this crate satisfy this
+    /// requirement.
+    ///
+    /// Unlike dense DFAs, the result is not necessarily aligned since a
+    /// sparse DFA's transition table is always read as a sequence of bytes.
+    ///
+    /// The output is a NUL terminated label, a fixed size header (endianness
+    /// marker, version, state ID size, options, start state, state count and
+    /// max match state), the 256 byte class map, and finally the transition
+    /// table re-encoded with the byte order `A`.
+    #[cfg(feature = "std")]
+    fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
+        let label = b"rust-regex-automata-sparse-dfa\x00";
+        let size =
+            // For human readable label.
+            label.len()
+            // endianness check, must be equal to 0xFEFF for native endian
+            + 2
+            // For version number.
+            + 2
+            // Size of state ID representation, in bytes.
+            // Must be 1, 2, 4 or 8.
+            + 2
+            // For DFA misc options. (Currently only the anchored flag.)
+            + 2
+            // For start state.
+            + 8
+            // For state count.
+            + 8
+            // For max match state.
+            + 8
+            // For byte class map.
+            + 256
+            // For transition table.
+            + self.trans().len();
+
+        // `i` is the write cursor into `buf`; it must equal `size` at the end.
+        let mut i = 0;
+        let mut buf = vec![0; size];
+
+        // write label
+        for &b in label {
+            buf[i] = b;
+            i += 1;
+        }
+        // endianness check
+        A::write_u16(&mut buf[i..], 0xFEFF);
+        i += 2;
+        // version number
+        A::write_u16(&mut buf[i..], 1);
+        i += 2;
+        // size of state ID
+        let state_size = size_of::<S>();
+        if ![1, 2, 4, 8].contains(&state_size) {
+            return Err(Error::serialize(&format!(
+                "state size of {} not supported, must be 1, 2, 4 or 8",
+                state_size
+            )));
+        }
+        A::write_u16(&mut buf[i..], state_size as u16);
+        i += 2;
+        // DFA misc options
+        let mut options = 0u16;
+        if self.anchored {
+            options |= dense::MASK_ANCHORED;
+        }
+        A::write_u16(&mut buf[i..], options);
+        i += 2;
+        // start state
+        A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
+        i += 8;
+        // state count
+        A::write_u64(&mut buf[i..], self.state_count as u64);
+        i += 8;
+        // max match state
+        A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
+        i += 8;
+        // byte class map
+        for b in (0..256).map(|b| b as u8) {
+            buf[i] = self.byte_classes.get(b);
+            i += 1;
+        }
+        // transition table
+        //
+        // Each state is re-encoded rather than copied wholesale, so that the
+        // transition count and the next state IDs are written with the
+        // requested byte order `A`. The input ranges are single bytes and are
+        // copied as-is.
+        for (_, state) in self.states() {
+            A::write_u16(&mut buf[i..], state.ntrans as u16);
+            i += 2;
+            buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
+            i += state.ntrans * 2;
+            for j in 0..state.ntrans {
+                write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
+                i += size_of::<S>();
+            }
+        }
+
+        assert_eq!(size, i, "expected to consume entire buffer");
+
+        Ok(buf)
+    }
+}
+
+impl<'a, S: StateID> Repr<&'a [u8], S> {
+    /// The implementation for deserializing a sparse DFA from raw bytes.
+    ///
+    /// The header fields are read in the order: NUL terminated label,
+    /// endianness marker, version, state ID size, options, start state,
+    /// state count, max match state and the 256 byte class map. Whatever
+    /// remains of `buf` is borrowed as the transition table without being
+    /// inspected.
+    ///
+    /// This panics if no NUL byte is found, if the endianness marker is not
+    /// `0xFEFF` when read as native endian, if the version is not 1 or if the
+    /// recorded state ID size differs from `size_of::<S>()`. Slicing a buffer
+    /// that is too short also panics.
+    unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
+        // skip over label
+        match buf.iter().position(|&b| b == b'\x00') {
+            None => panic!("could not find label"),
+            Some(i) => buf = &buf[i + 1..],
+        }
+
+        // check that current endianness is same as endianness of DFA
+        let endian_check = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+        if endian_check != 0xFEFF {
+            panic!(
+                "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
+                 are you trying to load a SparseDFA serialized with a \
+                 different endianness?",
+                endian_check,
+            );
+        }
+
+        // check that the version number is supported
+        let version = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+        if version != 1 {
+            panic!(
+                "expected version 1, but found unsupported version {}",
+                version,
+            );
+        }
+
+        // read size of state
+        let state_size = NativeEndian::read_u16(buf) as usize;
+        if state_size != size_of::<S>() {
+            panic!(
+                "state size of SparseDFA ({}) does not match \
+                 requested state size ({})",
+                state_size,
+                size_of::<S>(),
+            );
+        }
+        buf = &buf[2..];
+
+        // read miscellaneous options
+        let opts = NativeEndian::read_u16(buf);
+        buf = &buf[2..];
+
+        // read start state
+        //
+        // NOTE(review): the stored u64 is narrowed with `as usize`, which
+        // would silently truncate on targets where `usize` is narrower than
+        // 64 bits. Confirm this is acceptable for trusted input. The same
+        // applies to the state count and max match state below.
+        let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
+        buf = &buf[8..];
+
+        // read state count
+        let state_count = NativeEndian::read_u64(buf) as usize;
+        buf = &buf[8..];
+
+        // read max match state
+        let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
+        buf = &buf[8..];
+
+        // read byte classes
+        let byte_classes = ByteClasses::from_slice(&buf[..256]);
+        buf = &buf[256..];
+
+        Repr {
+            // Any bit of the options that overlaps the anchored mask marks
+            // the DFA as anchored.
+            anchored: opts & dense::MASK_ANCHORED > 0,
+            start,
+            state_count,
+            max_match,
+            byte_classes,
+            trans: buf,
+        }
+    }
+}
+
+#[cfg(feature = "std")]
+impl<S: StateID> Repr<Vec<u8>, S> {
+    /// The implementation for constructing a sparse DFA from a dense DFA.
+    ///
+    /// `A` is the state identifier representation of the resulting sparse
+    /// DFA. An error is returned if the byte offset of some state in the new
+    /// transition table cannot be converted to `A`.
+    fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
+        dfa: &dense::Repr<T, S>,
+    ) -> Result<Repr<Vec<u8>, A>> {
+        // In order to build the transition table, we need to be able to write
+        // state identifiers for each of the "next" transitions in each state.
+        // Our state identifiers correspond to the byte offset in the
+        // transition table at which the state is encoded. Therefore, we do not
+        // actually know what the state identifiers are until we've allocated
+        // exactly as much space as we need for each state. Thus, construction
+        // of the transition table happens in two passes.
+        //
+        // In the first pass, we fill out the shell of each state, which
+        // includes the transition count, the input byte ranges and zero-filled
+        // space for the transitions. In this first pass, we also build up a
+        // map from the state identifier index of the dense DFA to the state
+        // identifier in this sparse DFA.
+        //
+        // In the second pass, we fill in the transitions based on the map
+        // built in the first pass.
+
+        // The capacity is only an initial hint; the table grows as states are
+        // pushed onto it.
+        let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
+        // Indexed by the dense DFA's state index; holds the sparse state ID.
+        let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
+        for (old_id, state) in dfa.states() {
+            let pos = trans.len();
+
+            remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
+            // zero-filled space for the transition count
+            trans.push(0);
+            trans.push(0);
+
+            let mut trans_count = 0;
+            for (b1, b2, _) in state.sparse_transitions() {
+                trans_count += 1;
+                trans.push(b1);
+                trans.push(b2);
+            }
+            // fill in the transition count
+            NativeEndian::write_u16(&mut trans[pos..], trans_count);
+
+            // zero-fill the actual transitions
+            let zeros = trans_count as usize * size_of::<A>();
+            trans.extend(iter::repeat(0).take(zeros));
+        }
+
+        let mut new = Repr {
+            anchored: dfa.is_anchored(),
+            start: remap[dfa.state_id_to_index(dfa.start_state())],
+            state_count: dfa.state_count(),
+            max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
+            byte_classes: dfa.byte_classes().clone(),
+            trans,
+        };
+        // Second pass: walk the dense states again and write the remapped
+        // target of each sparse transition into the reserved space.
+        for (old_id, old_state) in dfa.states() {
+            let new_id = remap[dfa.state_id_to_index(old_id)];
+            let mut new_state = new.state_mut(new_id);
+            let sparse = old_state.sparse_transitions();
+            for (i, (_, _, next)) in sparse.enumerate() {
+                let next = remap[dfa.state_id_to_index(next)];
+                new_state.set_next_at(i, next);
+            }
+        }
+        Ok(new)
+    }
+
+    /// Decode the state stored at byte offset `id` of the transition table
+    /// into a view that permits writing to it.
+    fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
+        let header_at = id.to_usize();
+        // The state begins with its transition count as a native endian u16.
+        let ntrans = NativeEndian::read_u16(&self.trans[header_at..]) as usize;
+
+        // The input ranges and the next state IDs are contiguous, so borrow
+        // them mutably as one slice and then split it in two.
+        let ranges_len = ntrans * 2;
+        let next_len = ntrans * size_of::<S>();
+        let body_at = header_at + 2;
+        let body = &mut self.trans[body_at..body_at + (ranges_len + next_len)];
+        let (input_ranges, next) = body.split_at_mut(ranges_len);
+        StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
+    }
+}
+
+/// Prints one line per state: a two character status marker, the state ID
+/// zero padded to six digits and the state's transitions.
+#[cfg(feature = "std")]
+impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        /// Compute the marker for a state. The first character is `D` for
+        /// the dead state, `>` for the start state and a space otherwise
+        /// (dead takes precedence over start). The second character is `*`
+        /// for a match state and a space otherwise.
+        fn state_status<T: AsRef<[u8]>, S: StateID>(
+            dfa: &Repr<T, S>,
+            id: S,
+        ) -> &'static str {
+            let is_dead = id == dead_id();
+            let is_start = id == dfa.start_state();
+            let is_match = dfa.is_match_state(id);
+            match (is_dead, is_start, is_match) {
+                (true, _, true) => "D*",
+                (true, _, false) => "D ",
+                (false, true, true) => ">*",
+                (false, true, false) => "> ",
+                (false, false, true) => " *",
+                (false, false, false) => "  ",
+            }
+        }
+
+        writeln!(f, "SparseDFA(")?;
+        for (id, state) in self.states() {
+            let marker = state_status(self, id);
+            let offset = id.to_usize();
+            writeln!(f, "{}{:06}: {:?}", marker, offset, state)?;
+        }
+        writeln!(f, ")")
+    }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields tuples, where the first element is the state ID and
+/// the second element is the state itself.
+#[cfg(feature = "std")]
+#[derive(Debug)]
+struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
+    /// The DFA whose states are being visited.
+    dfa: &'a Repr<T, S>,
+    /// The byte offset in the transition table of the next state to yield.
+    ///
+    /// This is deliberately a `usize` and not an `S`. Once the final state
+    /// has been yielded, the offset equals the length of the transition
+    /// table, and that value need not be representable by `S` (for example
+    /// when `S` is `u8` and the table is 256 bytes or longer). Storing it in
+    /// an `S` could make it alias an earlier offset, in which case iteration
+    /// would revisit states instead of terminating.
+    id: usize,
+}
+
+#[cfg(feature = "std")]
+impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
+    type Item = (S, State<'a, S>);
+
+    fn next(&mut self) -> Option<(S, State<'a, S>)> {
+        // Reaching (or passing) the end of the table means every state has
+        // been visited.
+        if self.id >= self.dfa.trans().len() {
+            return None;
+        }
+        // The offset is only converted to `S` once it is known to lie inside
+        // the table, i.e. when it is the offset of an actual state.
+        let id = S::from_usize(self.id);
+        let state = self.dfa.state(id);
+        // States are stored back to back, so the next one starts right after
+        // the encoded bytes of this one.
+        self.id += state.bytes();
+        Some((id, state))
+    }
+}
+
+/// A borrowed view of a single sparse DFA state. It is cheap to build from a
+/// state identifier since it only slices into the transition table.
+#[derive(Clone)]
+struct State<'a, S: StateID = usize> {
+    /// Records which state identifier representation the originating DFA
+    /// uses. The transition table is a compact `&[u8]`, so `S` would not
+    /// otherwise appear in this type; carrying it here stops callers from
+    /// decoding this state with the wrong representation.
+    _state_id_repr: PhantomData<S>,
+    /// How many transitions this state has.
+    ntrans: usize,
+    /// One pair of bytes per transition, giving the inclusive start and end
+    /// of the input byte range handled by that transition.
+    input_ranges: &'a [u8],
+    /// The target state of each transition, stored as native endian encoded
+    /// identifiers of type `S`. The slice therefore holds
+    /// `ntrans * size_of::<S>()` bytes.
+    next: &'a [u8],
+}
+
+impl<'a, S: StateID> State<'a, S> {
+    /// Find the transition whose range contains `input` and return its
+    /// target. The dead state is returned when no range contains `input`.
+    fn next(&self, input: u8) -> S {
+        // A plain linear scan is used on purpose. It was observed to be much
+        // better than binary search on ASCII haystacks, likely because a
+        // binary search visits the ASCII case last while a linear scan sees
+        // it first. Binary search does a little better on non-ASCII
+        // haystacks, but not by much. There might be a better trade off
+        // lurking here.
+        //
+        // Bailing out early once `input` is below a range's start would be
+        // possible, but that extra branch did not seem to help performance
+        // and could even hurt it. It is likely very dependent on the DFA
+        // itself and on what is being searched.
+        for i in 0..self.ntrans {
+            let (lo, hi) = self.range(i);
+            if input < lo || hi < input {
+                continue;
+            }
+            return self.next_at(i);
+        }
+        dead_id()
+    }
+
+    /// Returns the inclusive `(start, end)` input bytes of transition `i`.
+    fn range(&self, i: usize) -> (u8, u8) {
+        let at = i * 2;
+        (self.input_ranges[at], self.input_ranges[at + 1])
+    }
+
+    /// Returns the target state of transition `i`.
+    fn next_at(&self, i: usize) -> S {
+        let at = i * size_of::<S>();
+        S::read_bytes(&self.next[at..])
+    }
+
+    /// Returns how many bytes the encoded form of this state occupies: the
+    /// two byte transition count, the input ranges and the next state IDs.
+    #[cfg(feature = "std")]
+    fn bytes(&self) -> usize {
+        let ranges_len = self.ntrans * 2;
+        let next_len = self.ntrans * size_of::<S>();
+        2 + ranges_len + next_len
+    }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for State<'a, S> {
+    /// Writes the transitions of this state as a comma separated list of
+    /// `byte => id` (single byte) or `start-end => id` (range) entries,
+    /// where `id` is the target state identifier.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut parts: Vec<String> = Vec::new();
+        for idx in 0..self.ntrans {
+            // Transitions into the dead state are omitted from the output
+            // entirely.
+            let target = self.next_at(idx);
+            if target == dead_id() {
+                continue;
+            }
+
+            // A range spanning a single byte is shown as just that byte.
+            let (lo, hi) = self.range(idx);
+            let id = target.to_usize();
+            let entry = if lo == hi {
+                format!("{} => {}", escape(lo), id)
+            } else {
+                format!("{}-{} => {}", escape(lo), escape(hi), id)
+            };
+            parts.push(entry);
+        }
+
+        write!(f, "{}", parts.join(", "))
+    }
+}
+
+/// A mutable view of a single sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "std")]
+struct StateMut<'a, S: StateID = usize> {
+    /// The state identifier representation used by the DFA from which this
+    /// state was extracted. Since our transition table is compacted into raw
+    /// bytes, the state ID type parameter is never used explicitly, so it is
+    /// recorded with a marker. This prevents callers from using an
+    /// incorrect state ID representation to read from or write to this state.
+    _state_id_repr: PhantomData<S>,
+    /// The number of transitions in this state.
+    ntrans: usize,
+    /// Pairs of input bytes, where there is one pair for each transition.
+    /// Each pair specifies the inclusive start and end of the byte range for
+    /// the corresponding transition.
+    input_ranges: &'a mut [u8],
+    /// Transitions to the next state. This slice contains native endian
+    /// encoded state identifiers, with `S` as the representation. Thus, there
+    /// are `ntrans * size_of::<S>()` bytes in this slice.
+    next: &'a mut [u8],
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> StateMut<'a, S> {
+    /// Overwrites the target of the ith transition with the state `next`.
+    fn set_next_at(&mut self, i: usize, next: S) {
+        S::write_bytes(next, &mut self.next[i * size_of::<S>()..]);
+    }
+}
+
+#[cfg(feature = "std")]
+impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
+    /// Formats this state through a read-only `State` view of the same data.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let (input_ranges, next): (&[u8], &[u8]) =
+            (&*self.input_ranges, &*self.next);
+        let _state_id_repr = self._state_id_repr;
+        let ntrans = self.ntrans;
+        let view = State { _state_id_repr, ntrans, input_ranges, next };
+        fmt::Debug::fmt(&view, f)
+    }
+}
+
+/// Return the byte `b` in the textual form of `ascii::escape_default`.
+#[cfg(feature = "std")]
+fn escape(b: u8) -> String {
+    use std::ascii::escape_default;
+
+    escape_default(b).map(char::from).collect()
+}
+
+/// Binary search over the transitions of a sparse DFA state. The transitions
+/// are given as a flat slice of byte pairs, where each pair delineates an
+/// inclusive range of input bytes. A transition matches when the input byte
+/// falls inside its range.
+///
+/// On success, the position of the matching pair (that is, the index of the
+/// ith transition) is returned. If no pair contains the input byte, then
+/// None is returned.
+///
+/// This routine is currently unused. Compared to a linear scan, it was
+/// observed to either decrease performance when searching ASCII, or to not
+/// provide enough of a boost on non-ASCII haystacks to be worth it. It is
+/// kept for posterity in case a way to make use of it is found.
+///
+/// In theory, the standard library's search routine could be used if a
+/// `&[u8]` could be cast to a `&[(u8, u8)]`, but that cast is not believed to
+/// be guaranteed safe and is thus UB (the in-memory representation of
+/// `(u8, u8)` does not appear to have been nailed down).
+#[inline(always)]
+#[allow(dead_code)]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+    debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+    // `lo` and `hi` count pairs rather than bytes; the pairs that may still
+    // contain `needle` are `lo..hi`.
+    let mut lo = 0;
+    let mut hi = ranges.len() / 2;
+    while lo < hi {
+        let pivot = (lo + hi) / 2;
+        match (ranges[pivot * 2], ranges[pivot * 2 + 1]) {
+            (start, _) if needle < start => hi = pivot,
+            (_, end) if needle > end => lo = pivot + 1,
+            _ => return Some(pivot),
+        }
+    }
+    None
+}
diff --git a/src/sparse_set.rs b/src/sparse_set.rs
new file mode 100644
index 0000000..6f145ba
--- /dev/null
+++ b/src/sparse_set.rs
@@ -0,0 +1,60 @@
+use std::slice;
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: http://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bearable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone, Debug)]
+pub struct SparseSet {
+    /// Dense contains the instruction pointers in the order in which they
+    /// were inserted.
+    dense: Vec<usize>,
+    /// Sparse maps instruction pointers to their location in dense.
+    ///
+    /// An instruction pointer is in the set if and only if
+    /// `sparse[ip] < dense.len() && ip == dense[sparse[ip]]`.
+    sparse: Box<[usize]>,
+}
+
+impl SparseSet {
+    /// Creates an empty set able to hold values in the range `0..size`.
+    pub fn new(size: usize) -> SparseSet {
+        let dense = Vec::with_capacity(size);
+        let sparse = vec![0; size].into_boxed_slice();
+        SparseSet { dense, sparse }
+    }
+
+    pub fn len(&self) -> usize {
+        self.dense.len()
+    }
+
+    /// Appends `value`. Panics at capacity or if `value` is out of range.
+    pub fn insert(&mut self, value: usize) {
+        let i = self.len();
+        assert!(i < self.dense.capacity());
+        self.dense.push(value);
+        self.sparse[value] = i;
+    }
+
+    pub fn contains(&self, value: usize) -> bool {
+        self.dense.get(self.sparse[value]).map_or(false, |&v| v == value)
+    }
+
+    pub fn clear(&mut self) {
+        self.dense.clear();
+    }
+}
+
+impl<'a> IntoIterator for &'a SparseSet {
+    type IntoIter = slice::Iter<'a, usize>;
+    type Item = &'a usize;
+    fn into_iter(self) -> slice::Iter<'a, usize> {
+        self.dense.as_slice().iter()
+    }
+}
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..c9bac19
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,291 @@
+use core::fmt::Debug;
+use core::hash::Hash;
+use core::mem::size_of;
+
+use byteorder::{ByteOrder, NativeEndian};
+
+#[cfg(feature = "std")]
+pub use self::std::*;
+
+#[cfg(feature = "std")]
+mod std {
+    use byteorder::ByteOrder;
+    use core::mem::size_of;
+    use error::{Error, Result};
+
+    use super::StateID;
+
+    /// Verify that premultiplying the state identifier `last_state` by
+    /// `alphabet_len` produces a value that fits into the representation
+    /// indicated by `S`. An error is returned if it does not fit, or if the
+    /// multiplication overflows `usize` itself.
+    pub fn premultiply_overflow_error<S: StateID>(
+        last_state: S,
+        alphabet_len: usize,
+    ) -> Result<()> {
+        match last_state.to_usize().checked_mul(alphabet_len) {
+            None => Err(Error::premultiply_overflow(0, 0)),
+            Some(requested) if requested > S::max_id() => {
+                Err(Error::premultiply_overflow(S::max_id(), requested))
+            }
+            Some(_) => Ok(()),
+        }
+    }
+
+    /// Compute the identifier that sequentially follows `current`, for use
+    /// by a freshly allocated state. An error is returned if that next
+    /// identifier would overflow `usize` or the representation indicated by
+    /// `S`.
+    pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
+        match current.to_usize().checked_add(1) {
+            None => Err(Error::state_id_overflow(::std::usize::MAX)),
+            Some(next) if next > S::max_id() => {
+                Err(Error::state_id_overflow(S::max_id()))
+            }
+            // Here `next <= S::max_id()`, so the conversion is in range.
+            Some(next) => Ok(S::from_usize(next)),
+        }
+    }
+
+    /// Convert `value` into the state identifier representation indicated by
+    /// `S`. An error is returned if `value` does not fit in that
+    /// representation.
+    pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+        if value <= S::max_id() {
+            return Ok(S::from_usize(value));
+        }
+        // Report the largest identifier that `S` is able to represent.
+        Err(Error::state_id_overflow(S::max_id()))
+    }
+
+    /// Write `id` into the start of `slice` using the endianness `E`. The
+    /// slice must have length at least `size_of::<S>()`.
+    ///
+    /// The state identifier representation `S` must have size 1, 2, 4 or 8.
+    pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
+        slice: &mut [u8],
+        id: S,
+    ) {
+        assert!(
+            1 == size_of::<S>()
+                || 2 == size_of::<S>()
+                || 4 == size_of::<S>()
+                || 8 == size_of::<S>()
+        );
+
+        let raw = id.to_usize();
+        match size_of::<S>() {
+            1 => slice[0] = raw as u8,
+            2 => E::write_u16(slice, raw as u16),
+            4 => E::write_u32(slice, raw as u32),
+            8 => E::write_u64(slice, raw as u64),
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Return the identifier reserved for a DFA's dead state, expressed in the
+/// representation indicated by `S`. This is always the identifier 0.
+pub fn dead_id<S: StateID>() -> S {
+    <S as StateID>::from_usize(0)
+}
+
+/// A trait describing the representation of a DFA's state identifier.
+///
+/// The purpose of this trait is to safely express both the possible state
+/// identifier representations that can be used in a DFA and to convert between
+/// state identifier representations and types that can be used to efficiently
+/// index memory (such as `usize`).
+///
+/// In general, one should not need to implement this trait explicitly. In
+/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
+/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
+/// represent all corresponding values in a `usize`.)
+///
+/// # Safety
+///
+/// This trait is unsafe because the correctness of its implementations may be
+/// relied upon by other unsafe code. For example, one possible way to
+/// implement this trait incorrectly would be to return a maximum identifier
+/// in `max_id` that is greater than the real maximum identifier. This will
+/// likely result in wrap-on-overflow semantics in release mode, which can in
+/// turn produce incorrect state identifiers. Those state identifiers may then
+/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
+/// checks are explicitly elided for performance reasons.
+pub unsafe trait StateID:
+    Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
+{
+    /// Convert from a `usize` to this implementation's representation.
+    ///
+    /// Implementors may assume that `n <= Self::max_id()`. That is,
+    /// implementors do not need to check whether `n` can fit inside this
+    /// implementation's representation.
+    fn from_usize(n: usize) -> Self;
+
+    /// Convert this implementation's representation to a `usize`.
+    ///
+    /// Implementors must not return a `usize` value greater than
+    /// `Self::max_id()` and must not permit overflow when converting between
+    /// the implementor's representation and `usize`. In general, the
+    /// preferred way for implementors to achieve this is to simply not
+    /// provide implementations of `StateID` that cannot fit into the target
+    /// platform's `usize`.
+    fn to_usize(self) -> usize;
+
+    /// Return the maximum state identifier supported by this representation.
+    ///
+    /// Implementors must return a correct bound. Doing otherwise may result
+    /// in memory unsafety.
+    fn max_id() -> usize;
+
+    /// Read a single state identifier from the given slice of bytes in native
+    /// endian format.
+    ///
+    /// Implementors may assume that the given slice has length at least
+    /// `size_of::<Self>()`.
+    fn read_bytes(slice: &[u8]) -> Self;
+
+    /// Write this state identifier to the given slice of bytes in native
+    /// endian format.
+    ///
+    /// Implementors may assume that the given slice has length at least
+    /// `size_of::<Self>()`.
+    fn write_bytes(self, slice: &mut [u8]);
+}
+
+unsafe impl StateID for usize {
+    #[inline]
+    fn from_usize(value: usize) -> Self {
+        value
+    }
+
+    #[inline]
+    fn to_usize(self) -> usize {
+        self
+    }
+
+    #[inline]
+    fn max_id() -> usize {
+        ::core::usize::MAX
+    }
+
+    #[inline]
+    fn read_bytes(bytes: &[u8]) -> Self {
+        NativeEndian::read_uint(bytes, size_of::<Self>()) as usize
+    }
+
+    #[inline]
+    fn write_bytes(self, out: &mut [u8]) {
+        NativeEndian::write_uint(out, self as u64, size_of::<Self>())
+    }
+}
+
+unsafe impl StateID for u8 {
+    #[inline]
+    fn from_usize(value: usize) -> Self {
+        value as u8
+    }
+
+    #[inline]
+    fn to_usize(self) -> usize {
+        usize::from(self)
+    }
+
+    #[inline]
+    fn max_id() -> usize {
+        usize::from(::core::u8::MAX)
+    }
+
+    #[inline]
+    fn read_bytes(bytes: &[u8]) -> Self {
+        bytes[0]
+    }
+
+    #[inline]
+    fn write_bytes(self, out: &mut [u8]) {
+        out[0] = self;
+    }
+}
+
+unsafe impl StateID for u16 {
+    #[inline]
+    fn from_usize(value: usize) -> Self {
+        value as u16
+    }
+
+    #[inline]
+    fn to_usize(self) -> usize {
+        usize::from(self)
+    }
+
+    #[inline]
+    fn max_id() -> usize {
+        usize::from(::core::u16::MAX)
+    }
+
+    #[inline]
+    fn read_bytes(bytes: &[u8]) -> Self {
+        NativeEndian::read_u16(bytes)
+    }
+
+    #[inline]
+    fn write_bytes(self, out: &mut [u8]) {
+        NativeEndian::write_u16(out, self)
+    }
+}
+
+#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+unsafe impl StateID for u32 {
+    #[inline]
+    fn from_usize(value: usize) -> Self {
+        value as u32
+    }
+
+    #[inline]
+    fn to_usize(self) -> usize {
+        self as usize
+    }
+
+    #[inline]
+    fn max_id() -> usize {
+        ::core::u32::MAX as usize
+    }
+
+    #[inline]
+    fn read_bytes(bytes: &[u8]) -> Self {
+        NativeEndian::read_u32(bytes)
+    }
+
+    #[inline]
+    fn write_bytes(self, out: &mut [u8]) {
+        NativeEndian::write_u32(out, self)
+    }
+}
+
+#[cfg(target_pointer_width = "64")]
+unsafe impl StateID for u64 {
+    #[inline]
+    fn from_usize(value: usize) -> Self {
+        value as u64
+    }
+
+    #[inline]
+    fn to_usize(self) -> usize {
+        self as usize
+    }
+
+    #[inline]
+    fn max_id() -> usize {
+        ::core::u64::MAX as usize
+    }
+
+    #[inline]
+    fn read_bytes(bytes: &[u8]) -> Self {
+        NativeEndian::read_u64(bytes)
+    }
+
+    #[inline]
+    fn write_bytes(self, out: &mut [u8]) {
+        NativeEndian::write_u64(out, self)
+    }
+}
diff --git a/src/transducer.rs b/src/transducer.rs
new file mode 100644
index 0000000..679c757
--- /dev/null
+++ b/src/transducer.rs
@@ -0,0 +1,107 @@
+use fst::Automaton;
+
+use crate::{StateID, DFA};
+
+// Implements `fst::Automaton` for `$dfa`, with `T: AsRef<[$repr]>`.
+macro_rules! imp {
+    ($dfa:ty, $repr:ty) => {
+        impl<T: AsRef<[$repr]>, S: StateID> Automaton for $dfa {
+            type State = S;
+            #[inline]
+            fn start(&self) -> S {
+                self.start_state()
+            }
+
+            #[inline]
+            fn is_match(&self, id: &S) -> bool {
+                self.is_match_state(*id)
+            }
+
+            #[inline]
+            fn accept(&self, id: &S, input: u8) -> S {
+                self.next_state(*id, input)
+            }
+
+            #[inline]
+            fn can_match(&self, id: &S) -> bool {
+                !self.is_dead_state(*id)
+            }
+        }
+    };
+}
+
+imp!(crate::dense::DenseDFA<T, S>, S);
+imp!(crate::dense::Standard<T, S>, S);
+imp!(crate::dense::ByteClass<T, S>, S);
+imp!(crate::dense::Premultiplied<T, S>, S);
+imp!(crate::dense::PremultipliedByteClass<T, S>, S);
+imp!(crate::sparse::SparseDFA<T, S>, u8);
+imp!(crate::sparse::Standard<T, S>, u8);
+imp!(crate::sparse::ByteClass<T, S>, u8);
+
+#[cfg(test)]
+mod tests {
+    use bstr::BString;
+    use fst::{Automaton, IntoStreamer, Set, Streamer};
+
+    use crate::dense::{self, DenseDFA};
+    use crate::sparse::SparseDFA;
+
+    /// The keys of the set that every test below searches, listed in
+    /// lexicographic order.
+    const KEYS: [&str; 7] = ["a", "bar", "baz", "wat", "xba", "xbax", "z"];
+
+    /// Runs the automaton over the set and collects every key it matches,
+    /// in the order in which the stream yields them.
+    fn search<A: Automaton, D: AsRef<[u8]>>(
+        set: &Set<D>,
+        aut: A,
+    ) -> Vec<BString> {
+        let mut found = vec![];
+        let mut stream = set.search(aut).into_stream();
+        loop {
+            match stream.next() {
+                Some(key) => found.push(BString::from(key)),
+                None => return found,
+            }
+        }
+    }
+
+    /// An unanchored dense DFA matches keys containing `ba` anywhere.
+    #[test]
+    fn dense_anywhere() {
+        let set = Set::from_iter(&KEYS).unwrap();
+        let dfa = DenseDFA::new("ba.*").unwrap();
+        assert_eq!(search(&set, &dfa), vec!["bar", "baz", "xba", "xbax"]);
+    }
+
+    /// An anchored dense DFA only matches keys that start with `ba`.
+    #[test]
+    fn dense_anchored() {
+        let set = Set::from_iter(&KEYS).unwrap();
+        let dfa = dense::Builder::new().anchored(true).build("ba.*").unwrap();
+        assert_eq!(search(&set, &dfa), vec!["bar", "baz"]);
+    }
+
+    /// An unanchored sparse DFA matches keys containing `ba` anywhere.
+    #[test]
+    fn sparse_anywhere() {
+        let set = Set::from_iter(&KEYS).unwrap();
+        let dfa = SparseDFA::new("ba.*").unwrap();
+        assert_eq!(search(&set, &dfa), vec!["bar", "baz", "xba", "xbax"]);
+    }
+
+    /// A sparse DFA converted from an anchored dense DFA only matches keys
+    /// that start with `ba`.
+    #[test]
+    fn sparse_anchored() {
+        let set = Set::from_iter(&KEYS).unwrap();
+        let dfa = dense::Builder::new()
+            .anchored(true)
+            .build("ba.*")
+            .unwrap()
+            .to_sparse()
+            .unwrap();
+        assert_eq!(search(&set, &dfa), vec!["bar", "baz"]);
+    }
+}
