Parse based on proc-macro2
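
synom's input type changes from `&str` to slices of proc-macro2 `TokenTree`s.
The `space` module and the `tag!` macro are removed, since the lexer already
handles whitespace and comments; `delim!` matches delimited token groups and
`input_end!` detects end of input. Entry into a parser now goes through
`InputBuf`, roughly like this (a sketch against the proc-macro2 branch named
below; the exact API may differ):

    let tokens: synom::TokenStream = "fn".parse().expect("lex error");
    let input = synom::InputBuf::new(tokens);
    // helper::keyword is the doc(hidden) function behind the keyword! macro.
    match synom::helper::keyword(&input, "fn") {
        synom::IResult::Done(_rest, kw) => assert_eq!(kw, "fn"),
        synom::IResult::Error => panic!("no match"),
    }
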
diff --git a/synom/Cargo.toml b/synom/Cargo.toml
index fde5ed4..5ef7c0f 100644
--- a/synom/Cargo.toml
+++ b/synom/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "synom"
-version = "0.11.3"
+version = "0.12.0"
authors = ["David Tolnay <dtolnay@gmail.com>"]
license = "MIT/Apache-2.0"
description = "Stripped-down Nom parser used by Syn"
@@ -11,10 +11,10 @@
include = ["Cargo.toml", "src/**/*.rs", "README.md", "LICENSE-APACHE", "LICENSE-MIT"]
[dependencies]
-unicode-xid = "0.0.4"
+proc-macro2 = { git = "https://github.com/mystor/proc-macro2", branch = "byte_literal" }
[dev-dependencies.syn]
-version = "0.11"
+version = "0.12"
path = ".."
features = ["parsing", "full"]
default-features = false
diff --git a/synom/src/helper.rs b/synom/src/helper.rs
index a488359..800874a 100644
--- a/synom/src/helper.rs
+++ b/synom/src/helper.rs
@@ -1,5 +1,4 @@
-use IResult;
-use space::{skip_whitespace, word_break};
+use {IResult, TokenTree, TokenKind, OpKind, Delimiter, InputBuf};
/// Parse a piece of punctuation like "+" or "+=".
///
@@ -33,13 +32,33 @@
// Not public API.
#[doc(hidden)]
-pub fn punct<'a>(input: &'a str, token: &'static str) -> IResult<&'a str, &'a str> {
- let input = skip_whitespace(input);
- if input.starts_with(token) {
- IResult::Done(&input[token.len()..], token)
- } else {
- IResult::Error
+pub fn punct<'a>(input: &'a [TokenTree], token: &'static str) -> IResult<&'a [TokenTree], &'a str> {
+ // Extract the characters from `token` so we know how many tokens to
+ // expect, check that we are not running past EOF, then confirm that the
+ // expected punctuation tokens are actually present.
+ let expected = token.chars().collect::<Vec<_>>();
+ if input.len() < expected.len() {
+ return IResult::Error;
}
+ for i in 0..expected.len() {
+ if let TokenKind::Op(c, ok) = input[i].kind {
+ if c != expected[i] {
+ return IResult::Error;
+ }
+
+ // Every Op except the last must be marked OpKind::Joint; the final
+ // one may be either Joint or Alone. We match rather than compare
+ // because OpKind doesn't implement Eq/PartialEq right now.
+ match ok {
+ OpKind::Alone if i != expected.len() - 1 => return IResult::Error,
+ _ => {}
+ }
+ } else {
+ return IResult::Error;
+ }
+ }
+
+ IResult::Done(&input[expected.len()..], token)
}
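
For reference, multi-character punctuation arrives from the lexer as a run of
`Op` tokens in which every character except the last is `OpKind::Joint`. A
hypothetical test of the behavior implemented above (not part of this diff):

    use synom::{IResult, InputBuf, TokenStream};
    use synom::helper::punct;

    // "+=" lexes as Op('+', Joint) then Op('=', ...), so punct matches it;
    // "+ =" lexes both ops as Alone and punct would return IResult::Error.
    let ts: TokenStream = "+=".parse().expect("lex error");
    let buf = InputBuf::new(ts);
    match punct(&buf, "+=") {
        IResult::Done(rest, tok) => {
            assert_eq!(tok, "+=");
            assert!(rest.is_empty());
        }
        IResult::Error => panic!("expected a match"),
    }
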
/// Parse a keyword like "fn" or "struct".
@@ -83,15 +102,11 @@
// Not public API.
#[doc(hidden)]
-pub fn keyword<'a>(input: &'a str, token: &'static str) -> IResult<&'a str, &'a str> {
- match punct(input, token) {
- IResult::Done(rest, _) => {
- match word_break(rest) {
- IResult::Done(_, _) => IResult::Done(rest, token),
- IResult::Error => IResult::Error,
- }
- }
- IResult::Error => IResult::Error,
+pub fn keyword<'a>(input: &'a [TokenTree], token: &'static str) -> IResult<&'a [TokenTree], &'static str> {
+ match input.first() {
+ Some(&TokenTree { kind: TokenKind::Word(ref symbol), .. }) if &**symbol == token =>
+ IResult::Done(&input[1..], token),
+ _ => IResult::Error,
}
}
@@ -497,11 +512,11 @@
// Not public API.
#[doc(hidden)]
-pub fn separated_list<'a, T>(mut input: &'a str,
+pub fn separated_list<'a, T>(mut input: &'a [TokenTree],
sep: &'static str,
- f: fn(&'a str) -> IResult<&'a str, T>,
+ f: fn(&'a [TokenTree]) -> IResult<&'a [TokenTree], T>,
terminated: bool)
- -> IResult<&'a str, Vec<T>> {
+ -> IResult<&'a [TokenTree], Vec<T>> {
let mut res = Vec::new();
// get the first element
@@ -541,3 +556,45 @@
}
}
}
+
+#[macro_export]
+macro_rules! delim {
+ ($i:expr, $delim:ident, $fmac:ident!( $($fargs:tt)* )) => {
+ match $crate::helper::delim_impl($i, $crate::Delimiter::$delim) {
+ Some((i, ib)) => {
+ match $fmac!(&*ib, $($fargs)*) {
+ $crate::IResult::Done(rest, val) => {
+ if rest.is_empty() {
+ $crate::IResult::Done(i, val)
+ } else {
+ $crate::IResult::Error
+ }
+ }
+ _ => $crate::IResult::Error,
+ }
+ }
+ _ => $crate::IResult::Error,
+ }
+ };
+ ($i:expr, $delim:ident, $f:expr) => {
+ delim!($i, $delim, call!($f))
+ };
+}
+
+// Not public API.
+#[doc(hidden)]
+pub fn delim_impl(input: &[TokenTree],
+ expected_delim: Delimiter)
+ -> Option<(&[TokenTree], InputBuf)> {
+ // NOTE: The `as u32` comparison is a workaround for `Delimiter` not
+ // implementing `PartialEq` or `Eq`, despite being a simple C-style enum.
+ match input.first() {
+ Some(&TokenTree {
+ kind: TokenKind::Sequence(delim, ref stream),
+ ..
+ }) if delim as u32 == expected_delim as u32 => {
+ Some((&input[1..], InputBuf::new(stream.clone())))
+ }
+ _ => None
+ }
+}
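
The `delim!` macro is the token-based replacement for scanning bracket
characters out of a string: `delim_impl` peels one `Sequence` token off the
input, checks its delimiter, and returns the group's contents as a fresh
`InputBuf`, which the inner parser must then consume completely. A
hypothetical use (not part of this diff):

    #[macro_use]
    extern crate synom;

    // Sketch: match a parenthesized `fn` keyword, i.e. the tokens `(fn)`.
    // If the inner parser leaves any of the group's tokens unconsumed,
    // delim! reports IResult::Error.
    named!(paren_fn -> &'static str, delim!(Parenthesis, keyword!("fn")));
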
diff --git a/synom/src/lib.rs b/synom/src/lib.rs
index 97c7833..ae9aebd 100644
--- a/synom/src/lib.rs
+++ b/synom/src/lib.rs
@@ -21,14 +21,51 @@
//! For our use case, this strategy is a huge improvement in usability,
//! correctness, and compile time over nom's `ws!` strategy.
-extern crate unicode_xid;
-
-#[doc(hidden)]
-pub mod space;
+extern crate proc_macro2;
#[doc(hidden)]
pub mod helper;
+// Re-export TokenStream et al. from proc_macro2 so that parsers built on
+// synom don't have to import these types manually.
+pub use proc_macro2::{TokenStream, TokenTree, TokenKind, Delimiter, OpKind, LexError};
+
+use std::ops::Deref;
+
+/// A `TokenStream` is not directly usable as a `synom` parser input. This
+/// type extracts the `TokenTree`s from a `TokenStream` into a buffer which
+/// can be borrowed as the `synom` input type, `&[TokenTree]`.
+pub struct InputBuf {
+ data: Vec<TokenTree>,
+}
+
+impl InputBuf {
+ /// Transform the input `TokenStream` into a buffer which can be used as a
+ /// `synom` parser input. Use the `Deref` implementation on this type to
+ /// obtain the underlying `&[TokenTree]` slice.
+ pub fn new(ts: TokenStream) -> Self {
+ fn flatten_stream(tt: TokenTree) -> Vec<TokenTree> {
+ match tt.kind {
+ TokenKind::Sequence(Delimiter::None, ts) => {
+ ts.into_iter().flat_map(flatten_stream).collect()
+ }
+ _ => vec![tt]
+ }
+ }
+
+ InputBuf {
+ data: ts.into_iter().flat_map(flatten_stream).collect()
+ }
+ }
+}
+
+impl Deref for InputBuf {
+ type Target = [TokenTree];
+ fn deref(&self) -> &[TokenTree] {
+ &self.data
+ }
+}
+
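
The `flatten_stream` helper above splices `Delimiter::None` groups in flat, so
parsers see through invisible delimiters around interpolated tokens. A
hypothetical usage (API per the branch above):

    use synom::{InputBuf, TokenStream, TokenTree};

    let tokens: TokenStream = "a + b".parse().expect("lex error");
    let input = InputBuf::new(tokens);
    // Deref yields the slice type that synom parsers consume.
    let slice: &[TokenTree] = &input;
    assert_eq!(slice.len(), 3); // `a`, `+`, `b`
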
/// The result of a parser.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum IResult<I, O> {
@@ -39,7 +76,7 @@
Error,
}
-impl<'a, O> IResult<&'a str, O> {
+impl<'a, O> IResult<&'a [TokenTree], O> {
/// Unwraps the result, asserting that the parse is complete. Panics with a
/// message based on the given string if the parse failed or is incomplete.
///
@@ -66,12 +103,11 @@
/// ```
pub fn expect(self, name: &str) -> O {
match self {
- IResult::Done(mut rest, o) => {
- rest = space::skip_whitespace(rest);
+ IResult::Done(rest, o) => {
if rest.is_empty() {
o
} else {
- panic!("unparsed tokens after {}: {:?}", name, rest)
+ panic!("unparsed tokens after {}: {:?}", name, /* rest */ ())
}
}
IResult::Error => panic!("failed to parse {}", name),
@@ -97,13 +133,13 @@
#[macro_export]
macro_rules! named {
($name:ident -> $o:ty, $submac:ident!( $($args:tt)* )) => {
- fn $name(i: &str) -> $crate::IResult<&str, $o> {
+ fn $name(i: &[$crate::TokenTree]) -> $crate::IResult<&[$crate::TokenTree], $o> {
$submac!(i, $($args)*)
}
};
(pub $name:ident -> $o:ty, $submac:ident!( $($args:tt)* )) => {
- pub fn $name(i: &str) -> $crate::IResult<&str, $o> {
+ pub fn $name(i: &[$crate::TokenTree]) -> $crate::IResult<&[$crate::TokenTree], $o> {
$submac!(i, $($args)*)
}
};
@@ -564,9 +600,9 @@
//
// Not public API.
#[doc(hidden)]
-pub fn many0<'a, T>(mut input: &'a str,
- f: fn(&'a str) -> IResult<&'a str, T>)
- -> IResult<&'a str, Vec<T>> {
+pub fn many0<'a, T>(mut input: &'a [TokenTree],
+ f: fn(&'a [TokenTree]) -> IResult<&'a [TokenTree], T>)
+ -> IResult<&'a [TokenTree], Vec<T>> {
let mut res = Vec::new();
loop {
@@ -691,54 +727,6 @@
}};
}
-/// Parse the given string from exactly the current position in the input. You
-/// almost always want `punct!` or `keyword!` instead of this.
-///
-/// The `tag!` parser is equivalent to `punct!` but does not ignore leading
-/// whitespace. Both `punct!` and `keyword!` skip over leading whitespace. See
-/// an explanation of synom's whitespace handling strategy in the top-level
-/// crate documentation.
-///
-/// - **Syntax:** `tag!("...")`
-/// - **Output:** `"..."`
-///
-/// ```rust
-/// extern crate syn;
-/// #[macro_use] extern crate synom;
-///
-/// use syn::StrLit;
-/// use syn::parse::string;
-/// use synom::IResult;
-///
-/// // Parse a proposed syntax for an owned string literal: "abc"s
-/// named!(owned_string -> String,
-/// map!(
-/// terminated!(string, tag!("s")),
-/// |lit: StrLit| lit.value
-/// )
-/// );
-///
-/// fn main() {
-/// let input = r#" "abc"s "#;
-/// let parsed = owned_string(input).expect("owned string literal");
-/// println!("{:?}", parsed);
-///
-/// let input = r#" "abc" s "#;
-/// let err = owned_string(input);
-/// assert_eq!(err, IResult::Error);
-/// }
-/// ```
-#[macro_export]
-macro_rules! tag {
- ($i:expr, $tag:expr) => {
- if $i.starts_with($tag) {
- $crate::IResult::Done(&$i[$tag.len()..], &$i[..$tag.len()])
- } else {
- $crate::IResult::Error
- }
- };
-}
-
/// Pattern-match the result of a parser to select which other parser to run.
///
/// - **Syntax:** `switch!(TARGET, PAT1 => THEN1 | PAT2 => THEN2 | ...)`
@@ -1223,3 +1211,20 @@
}
};
}
+
+#[macro_export]
+macro_rules! input_end {
+ ($i:expr,) => {
+ $crate::input_end($i)
+ };
+}
+
+// Not public API.
+#[doc(hidden)]
+pub fn input_end(input: &[TokenTree]) -> IResult<&'static [TokenTree], &'static str> {
+ if input.is_empty() {
+ IResult::Done(&[], "")
+ } else {
+ IResult::Error
+ }
+}
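
`input_end!` gives grammars an explicit end-of-input anchor, which matters now
that `expect` no longer skips trailing whitespace itself. A hypothetical use
(not part of this diff):

    #[macro_use]
    extern crate synom;

    // Sketch: accept `fn` only when no tokens follow it.
    named!(just_fn -> &'static str,
           terminated!(keyword!("fn"), input_end!()));
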
diff --git a/synom/src/space.rs b/synom/src/space.rs
deleted file mode 100644
index 5237522..0000000
--- a/synom/src/space.rs
+++ /dev/null
@@ -1,99 +0,0 @@
-use IResult;
-use unicode_xid::UnicodeXID;
-
-pub fn whitespace(input: &str) -> IResult<&str, ()> {
- if input.is_empty() {
- return IResult::Error;
- }
-
- let bytes = input.as_bytes();
- let mut i = 0;
- while i < bytes.len() {
- let s = &input[i..];
- if bytes[i] == b'/' {
- if s.starts_with("//") && (!s.starts_with("///") || s.starts_with("////")) &&
- !s.starts_with("//!") {
- if let Some(len) = s.find('\n') {
- i += len + 1;
- continue;
- }
- break;
- } else if s.starts_with("/*") && (!s.starts_with("/**") || s.starts_with("/***")) &&
- !s.starts_with("/*!") {
- match block_comment(s) {
- IResult::Done(_, com) => {
- i += com.len();
- continue;
- }
- IResult::Error => {
- return IResult::Error;
- }
- }
- }
- }
- match bytes[i] {
- b' ' | 0x09...0x0d => {
- i += 1;
- continue;
- }
- b if b <= 0x7f => {}
- _ => {
- let ch = s.chars().next().unwrap();
- if is_whitespace(ch) {
- i += ch.len_utf8();
- continue;
- }
- }
- }
- return if i > 0 {
- IResult::Done(s, ())
- } else {
- IResult::Error
- };
- }
- IResult::Done("", ())
-}
-
-pub fn block_comment(input: &str) -> IResult<&str, &str> {
- if !input.starts_with("/*") {
- return IResult::Error;
- }
-
- let mut depth = 0;
- let bytes = input.as_bytes();
- let mut i = 0;
- let upper = bytes.len() - 1;
- while i < upper {
- if bytes[i] == b'/' && bytes[i + 1] == b'*' {
- depth += 1;
- i += 1; // eat '*'
- } else if bytes[i] == b'*' && bytes[i + 1] == b'/' {
- depth -= 1;
- if depth == 0 {
- return IResult::Done(&input[i + 2..], &input[..i + 2]);
- }
- i += 1; // eat '/'
- }
- i += 1;
- }
- IResult::Error
-}
-
-pub fn word_break(input: &str) -> IResult<&str, ()> {
- match input.chars().next() {
- Some(ch) if UnicodeXID::is_xid_continue(ch) => IResult::Error,
- Some(_) | None => IResult::Done(input, ()),
- }
-}
-
-pub fn skip_whitespace(input: &str) -> &str {
- match whitespace(input) {
- IResult::Done(rest, _) => rest,
- IResult::Error => input,
- }
-}
-
-fn is_whitespace(ch: char) -> bool {
- // Rust treats left-to-right mark and right-to-left mark as whitespace
- ch.is_whitespace() || ch == '\u{200e}' || ch == '\u{200f}'
-}