fix!: tokenize keywords and identifiers correctly (#428)
Merge pull request #428 from rigetti/427-keyword-tokenization
antalsz authored Jan 10, 2025
2 parents 51c6a66 + 4a65159 commit 74c2c6b
Showing 6 changed files with 157 additions and 180 deletions.
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions quil-rs/proptest-regressions/expression/mod.txt
@@ -6,3 +6,5 @@
 # everyone who runs the test benefits from these saved cases.
 cc 4c32128d724ed0f840715fae4e194c99262dc153c64be39d2acf45b8903b20f7 # shrinks to value = Complex { re: 0.0, im: -0.13530277317700273 }
 cc 5cc95f2159ad7120bbaf296d3a9fb26fef30f57b61e76b3e0dc99f4759009fdb # shrinks to e = Number(Complex { re: 0.0, im: -2.772221265116396 })
+cc de70a1853ccef983fac85a87761ba08bfb2d54b2d4e880d5d90e7b4a75ecafb5 # shrinks to e = Address(MemoryReference { name: "mut", index: 0 })
+cc 9ad50859b68cb403ce1a67af0feef1f55d25587466878e364ba2810be5910b14 # shrinks to e = Address(MemoryReference { name: "iNf", index: 0 })
152 changes: 46 additions & 106 deletions quil-rs/src/parser/lexer/mod.rs
@@ -16,8 +16,10 @@ mod error;
 mod quoted_strings;
 mod wrapped_parsers;
 
+use std::str::FromStr;
+
 use nom::{
-    bytes::complete::{is_a, tag_no_case, take_till, take_while, take_while1},
+    bytes::complete::{is_a, take_till, take_while, take_while1},
     character::complete::{digit1, one_of},
     combinator::{all_consuming, map, recognize, value},
     multi::many0,
@@ -28,7 +30,7 @@ use nom::{
 use nom_locate::LocatedSpan;
 use wrapped_parsers::{alt, tag};
 
-pub use super::token::{Token, TokenWithLocation};
+pub use super::token::{KeywordToken, Token, TokenWithLocation};
 use crate::parser::lexer::wrapped_parsers::expecting;
 use crate::parser::token::token_with_location;
 pub(crate) use error::InternalLexError;
@@ -92,7 +94,7 @@ pub enum Command {
     Xor,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
+#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
 #[strum(serialize_all = "UPPERCASE")]
 pub enum DataType {
     Bit,
@@ -101,7 +103,7 @@ pub enum DataType {
     Integer,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
+#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
 #[strum(serialize_all = "UPPERCASE")]
 pub enum Modifier {
     Controlled,
@@ -163,30 +165,17 @@ fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
         "a token",
         (
             token_with_location(lex_comment),
-            token_with_location(lex_data_type),
-            token_with_location(lex_modifier),
             token_with_location(lex_punctuation),
             token_with_location(lex_target),
             token_with_location(lex_string),
             // Operator must come before number (or it may be parsed as a prefix)
             token_with_location(lex_operator),
-            token_with_location(lex_number),
             token_with_location(lex_variable),
-            token_with_location(lex_non_blocking),
-            // This should come last because it's sort of a catch all
-            token_with_location(lex_command_or_identifier),
-        ),
-    )(input)
-}
-
-fn lex_data_type(input: LexInput) -> InternalLexResult {
-    alt(
-        "a data type",
-        (
-            value(Token::DataType(DataType::Bit), tag("BIT")),
-            value(Token::DataType(DataType::Integer), tag("INTEGER")),
-            value(Token::DataType(DataType::Octet), tag("OCTET")),
-            value(Token::DataType(DataType::Real), tag("REAL")),
+            // Identifiers must come before numbers so that `NaN`, `Inf`, and `Infinity` aren't
+            // parsed as floats; Nom, as of version 7.1.1, will parse those strings,
+            // case-insensitively, as floats
+            token_with_location(lex_keyword_or_identifier),
+            token_with_location(lex_number),
         ),
     )(input)
 }
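
A note on the reordering above: the new comment refers to Nom's own float parser, which accepts the IEEE-754 special spellings case-insensitively; this is exactly what the new proptest regression for a memory reference named "iNf" records. A minimal standalone sketch of the pitfall (an editor's illustration, not part of this diff; assumes a plain nom 7.x dependency):

use nom::number::complete::double;

fn main() {
    // nom's `double` happily consumes the special spellings...
    let (rest, value) = double::<&str, nom::error::Error<&str>>("iNf").unwrap();
    assert!(value.is_infinite() && rest.is_empty());
    // ...so if numbers were lexed before identifiers, "iNf" and "NaN"
    // would become Token::Float instead of Token::Identifier.
    let (_rest, value) = double::<&str, nom::error::Error<&str>>("NaN").unwrap();
    assert!(value.is_nan());
}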
@@ -197,62 +186,16 @@ fn lex_comment(input: LexInput) -> InternalLexResult {
     Ok((input, Token::Comment(content.to_string())))
 }
 
-/// If the given identifier string matches a command keyword, return the keyword;
-/// otherwise, return the original identifier as a token.
-fn recognize_command_or_identifier(identifier: String) -> Token {
-    use Command::*;
-
-    match identifier.as_str() {
-        "DEFGATE" => Token::Command(DefGate),
-        "ADD" => Token::Command(Add),
-        "AND" => Token::Command(And),
-        "CALL" => Token::Command(Call),
-        "CONVERT" => Token::Command(Convert),
-        "DIV" => Token::Command(Div),
-        "EQ" => Token::Command(Eq),
-        "EXCHANGE" => Token::Command(Exchange),
-        "GE" => Token::Command(GE),
-        "GT" => Token::Command(GT),
-        "IOR" => Token::Command(Ior),
-        "LE" => Token::Command(LE),
-        "LOAD" => Token::Command(Load),
-        "LT" => Token::Command(LT),
-        "MOVE" => Token::Command(Move),
-        "MUL" => Token::Command(Mul),
-        "NEG" => Token::Command(Neg),
-        "NOT" => Token::Command(Not),
-        "STORE" => Token::Command(Store),
-        "SUB" => Token::Command(Sub),
-        "XOR" => Token::Command(Xor),
-        "DEFCIRCUIT" => Token::Command(DefCircuit),
-        "MEASURE" => Token::Command(Measure),
-        "HALT" => Token::Command(Halt),
-        "WAIT" => Token::Command(Wait),
-        "JUMP-WHEN" => Token::Command(JumpWhen),
-        "JUMP-UNLESS" => Token::Command(JumpUnless),
-        "JUMP" => Token::Command(Jump),
-        "RESET" => Token::Command(Reset),
-        "NOP" => Token::Command(Nop),
-        "INCLUDE" => Token::Command(Include),
-        "PRAGMA" => Token::Command(Pragma),
-        "DECLARE" => Token::Command(Declare),
-        "CAPTURE" => Token::Command(Capture),
-        "DEFCAL" => Token::Command(DefCal),
-        "DEFFRAME" => Token::Command(DefFrame),
-        "DEFWAVEFORM" => Token::Command(DefWaveform),
-        "DELAY" => Token::Command(Delay),
-        "FENCE" => Token::Command(Fence),
-        "PULSE" => Token::Command(Pulse),
-        "RAW-CAPTURE" => Token::Command(RawCapture),
-        "SET-FREQUENCY" => Token::Command(SetFrequency),
-        "SET-PHASE" => Token::Command(SetPhase),
-        "SET-SCALE" => Token::Command(SetScale),
-        "SWAP-PHASES" => Token::Command(SwapPhases),
-        "SHIFT-FREQUENCY" => Token::Command(ShiftFrequency),
-        "SHIFT-PHASE" => Token::Command(ShiftPhase),
-        "LABEL" => Token::Command(Label),
-        _ => Token::Identifier(identifier),
+fn keyword_or_identifier(identifier: String) -> Token {
+    fn parse<T: FromStr>(token: impl Fn(T) -> Token, identifier: &str) -> Result<Token, T::Err> {
+        T::from_str(identifier).map(token)
     }
 
+    parse(KeywordToken::into, &identifier)
+        .or_else(|_| parse(Token::Command, &identifier))
+        .or_else(|_| parse(Token::DataType, &identifier))
+        .or_else(|_| parse(Token::Modifier, &identifier))
+        .unwrap_or(Token::Identifier(identifier))
 }
 
 fn is_valid_identifier_leading_character(chr: char) -> bool {
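
The rewritten classifier above replaces the hand-maintained keyword match: each keyword family now derives FromStr via strum::EnumString, so classification is a chain of exact, whole-string parses with Token::Identifier as the fallback. A self-contained sketch of the mechanism (illustrative only; assumes the strum crate with its derive feature):

use std::str::FromStr;
use strum::EnumString; // assumes strum = { version = "0.26", features = ["derive"] }

#[derive(Debug, PartialEq, EnumString)]
#[strum(serialize_all = "UPPERCASE")]
enum DataType {
    Bit,
    Integer,
    Octet,
    Real,
}

fn main() {
    // A whole-string, case-sensitive match succeeds...
    assert_eq!(DataType::from_str("BIT"), Ok(DataType::Bit));
    // ...while "BITS" and "bit" fall through to the identifier fallback,
    // unlike the old prefix-matching tag("BIT") and tag_no_case("MUT") parsers.
    assert!(DataType::from_str("BITS").is_err());
    assert!(DataType::from_str("bit").is_err());
}

Because from_str never matches a prefix, the old failure mode of lexing BITS as BIT followed by a stray S disappears, and keyword matching becomes case-sensitive, as the new test cases below confirm.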
@@ -286,9 +229,9 @@ fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
     )(input)
 }
 
-fn lex_command_or_identifier(input: LexInput) -> InternalLexResult {
+fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
     let (input, identifier) = lex_identifier_raw(input)?;
-    let token = recognize_command_or_identifier(identifier);
+    let token = keyword_or_identifier(identifier);
     Ok((input, token))
 }
 
@@ -298,10 +241,6 @@ fn lex_target(input: LexInput) -> InternalLexResult {
     Ok((input, Token::Target(label)))
 }
 
-fn lex_non_blocking(input: LexInput) -> InternalLexResult {
-    value(Token::NonBlocking, tag("NONBLOCKING"))(input)
-}
-
 fn lex_number(input: LexInput) -> InternalLexResult {
     let (input, float_string): (LexInput, LexInput) = recognize(double)(input)?;
     let integer_parse_result: IResult<LexInput, _> = all_consuming(digit1)(float_string);
@@ -318,24 +257,6 @@ fn lex_number(input: LexInput) -> InternalLexResult {
     ))
 }
 
-fn lex_modifier(input: LexInput) -> InternalLexResult {
-    alt(
-        "a modifier token",
-        (
-            value(Token::As, tag("AS")),
-            value(Token::Matrix, tag("MATRIX")),
-            value(Token::Modifier(Modifier::Controlled), tag("CONTROLLED")),
-            value(Token::Modifier(Modifier::Dagger), tag("DAGGER")),
-            value(Token::Modifier(Modifier::Forked), tag("FORKED")),
-            value(Token::Mutable, tag_no_case("MUT")),
-            value(Token::Offset, tag("OFFSET")),
-            value(Token::PauliSum, tag("PAULI-SUM")),
-            value(Token::Permutation, tag("PERMUTATION")),
-            value(Token::Sharing, tag("SHARING")),
-        ),
-    )(input)
-}
-
 fn lex_operator(input: LexInput) -> InternalLexResult {
     use Operator::*;
     map(
@@ -399,7 +320,7 @@ mod tests {
     use nom_locate::LocatedSpan;
    use rstest::*;
 
-    use crate::parser::common::tests::KITCHEN_SINK_QUIL;
+    use crate::parser::{common::tests::KITCHEN_SINK_QUIL, DataType};
 
     use super::{lex, Command, Operator, Token};
 
@@ -565,10 +486,29 @@
     case("a", vec![Token::Identifier("a".to_string())]),
     case("_a-2_b-2_", vec![Token::Identifier("_a-2_b-2_".to_string())]),
     case("a-2-%var", vec![
-        Token::Identifier("a-2".to_string()),
-        Token::Operator(Operator::Minus),
-        Token::Variable("var".to_string())
-    ])
+        Token::Identifier("a-2".to_string()),
+        Token::Operator(Operator::Minus),
+        Token::Variable("var".to_string())
+    ]),
+    case("BIT", vec![Token::DataType(DataType::Bit)]),
+    case("BITS", vec![Token::Identifier("BITS".to_string())]),
+    case("NaN", vec![Token::Identifier("NaN".to_string())]),
+    case("nan", vec![Token::Identifier("nan".to_string())]),
+    case("NaNa", vec![Token::Identifier("NaNa".to_string())]),
+    case("nana", vec![Token::Identifier("nana".to_string())]),
+    case("INF", vec![Token::Identifier("INF".to_string())]),
+    case("Infinity", vec![Token::Identifier("Infinity".to_string())]),
+    case("Inferior", vec![Token::Identifier("Inferior".to_string())]),
+    case("-NaN", vec![Token::Operator(Operator::Minus), Token::Identifier("NaN".to_string())]),
+    case("-inf", vec![Token::Operator(Operator::Minus), Token::Identifier("inf".to_string())]),
+    case("-Infinity", vec![
+        Token::Operator(Operator::Minus),
+        Token::Identifier("Infinity".to_string())
+    ]),
+    case("-inferior", vec![
+        Token::Operator(Operator::Minus),
+        Token::Identifier("inferior".to_string())
+    ]),
 )]
 fn it_lexes_identifier(input: &str, expected: Vec<Token>) {
     let input = LocatedSpan::new(input);
4 changes: 2 additions & 2 deletions quil-rs/src/parser/mod.rs
@@ -31,8 +31,8 @@ mod token;
 
 pub(crate) use error::{ErrorInput, InternalParseError};
 pub use error::{ParseError, ParserErrorKind};
-pub use lexer::LexError;
-pub use token::{Token, TokenWithLocation};
+pub use lexer::{Command, DataType, LexError, Modifier};
+pub use token::{KeywordToken, Token, TokenWithLocation};
 
 pub(crate) type ParserInput<'a> = &'a [TokenWithLocation<'a>];
 type InternalParserResult<'a, R, E = InternalParseError<'a>> = IResult<ParserInput<'a>, R, E>;
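
These broadened re-exports let downstream code name the token kinds directly. A hypothetical consumer-side sketch (the quil_rs::parser path is assumed from this file's location, and the assertion relies on Token's Display printing a data type the same way its other arms do):

use quil_rs::parser::{DataType, Token};

fn main() {
    // Constructing and displaying a token from outside the parser module,
    // which the widened re-exports above make possible.
    let token = Token::DataType(DataType::Real);
    assert_eq!(token.to_string(), "REAL");
}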
89 changes: 81 additions & 8 deletions quil-rs/src/parser/token.rs
@@ -67,6 +67,79 @@
     }
 }
 
+/// The subset of [`Token`]s which (a) do not have arguments and (b) are keywords. Used to ensure
+/// that keyword-checking remains in sync with the definition of [`Token`].
+#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
+#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
+pub enum KeywordToken {
+    As,
+    Matrix,
+    #[strum(serialize = "mut")]
+    Mutable,
+    #[strum(serialize = "NONBLOCKING")]
+    NonBlocking,
+    Offset,
+    PauliSum,
+    Permutation,
+    Sharing,
+}
+
+impl From<KeywordToken> for Token {
+    fn from(token: KeywordToken) -> Self {
+        match token {
+            KeywordToken::As => Token::As,
+            KeywordToken::Matrix => Token::Matrix,
+            KeywordToken::Mutable => Token::Mutable,
+            KeywordToken::NonBlocking => Token::NonBlocking,
+            KeywordToken::Offset => Token::Offset,
+            KeywordToken::PauliSum => Token::PauliSum,
+            KeywordToken::Permutation => Token::Permutation,
+            KeywordToken::Sharing => Token::Sharing,
+        }
+    }
+}
+
+impl TryFrom<Token> for KeywordToken {
+    type Error = ();
+
+    fn try_from(token: Token) -> Result<Self, Self::Error> {
+        // This match is explicit so that if you add a new [`Token`] constructor you have to decide
+        // if it's a keyword.
+        #[deny(clippy::wildcard_enum_match_arm, clippy::wildcard_in_or_patterns)]
+        match token {
+            Token::As => Ok(KeywordToken::As),
+            Token::Matrix => Ok(KeywordToken::Matrix),
+            Token::Mutable => Ok(KeywordToken::Mutable),
+            Token::NonBlocking => Ok(KeywordToken::NonBlocking),
+            Token::Offset => Ok(KeywordToken::Offset),
+            Token::PauliSum => Ok(KeywordToken::PauliSum),
+            Token::Permutation => Ok(KeywordToken::Permutation),
+            Token::Sharing => Ok(KeywordToken::Sharing),
+
+            Token::Colon
+            | Token::Comma
+            | Token::Command(_)
+            | Token::Comment(_)
+            | Token::DataType(_)
+            | Token::Float(_)
+            | Token::Identifier(_)
+            | Token::Indentation
+            | Token::Integer(_)
+            | Token::Target(_)
+            | Token::LBracket
+            | Token::LParenthesis
+            | Token::Modifier(_)
+            | Token::NewLine
+            | Token::Operator(_)
+            | Token::RBracket
+            | Token::RParenthesis
+            | Token::Semicolon
+            | Token::String(_)
+            | Token::Variable(_) => Err(()),
+        }
+    }
+}
+
 #[derive(Clone, PartialEq)]
 pub enum Token {
     As,
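
Two details of the new KeywordToken above deserve a note. First, the strum attributes: SCREAMING-KEBAB-CASE renders PauliSum as PAULI-SUM, which is why Mutable and NonBlocking carry explicit overrides (the keyword mut is lowercase, and NONBLOCKING has no hyphen). A standalone sketch of that behavior (illustrative enum name; assumes strum with its derive feature):

use std::str::FromStr;
use strum::{Display, EnumString}; // assumes strum = { version = "0.26", features = ["derive"] }

#[derive(Debug, PartialEq, Display, EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
enum Kw {
    #[strum(serialize = "mut")]
    Mutable,
    NonBlocking, // without an override, this serializes as "NON-BLOCKING"
    PauliSum,
}

fn main() {
    assert_eq!(Kw::PauliSum.to_string(), "PAULI-SUM");
    assert_eq!(Kw::Mutable.to_string(), "mut");
    assert_eq!(Kw::NonBlocking.to_string(), "NON-BLOCKING");
    assert_eq!(Kw::from_str("mut"), Ok(Kw::Mutable));
    assert!(Kw::from_str("MUT").is_err()); // exact case only
}

Second, the exhaustive TryFrom<Token> match with its deny(clippy::wildcard_enum_match_arm) attribute turns keyword-ness into a compile-time checklist: adding a Token variant fails cargo clippy until someone decides whether the new token is a keyword.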
@@ -102,7 +175,7 @@ pub enum Token {
 impl fmt::Display for Token {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            Token::As => write!(f, "AS"),
+            Token::As => write!(f, "{}", KeywordToken::As),
             Token::Colon => write!(f, ":"),
             Token::Comma => write!(f, ","),
             Token::Command(cmd) => write!(f, "{cmd}"),
@@ -115,19 +188,19 @@ impl fmt::Display for Token {
             Token::Target(label) => write!(f, "{label}"),
             Token::LBracket => write!(f, "["),
             Token::LParenthesis => write!(f, "("),
-            Token::NonBlocking => write!(f, "NONBLOCKING"),
-            Token::Matrix => write!(f, "MATRIX"),
+            Token::NonBlocking => write!(f, "{}", KeywordToken::NonBlocking),
+            Token::Matrix => write!(f, "{}", KeywordToken::Matrix),
             Token::Modifier(m) => write!(f, "{m}"),
-            Token::Mutable => write!(f, "MUT"),
+            Token::Mutable => write!(f, "{}", KeywordToken::Mutable),
             Token::NewLine => write!(f, "NEWLINE"),
             Token::Operator(op) => write!(f, "{op}"),
-            Token::Offset => write!(f, "OFFSET"),
-            Token::PauliSum => write!(f, "PAULI-SUM"),
-            Token::Permutation => write!(f, "PERMUTATION"),
+            Token::Offset => write!(f, "{}", KeywordToken::Offset),
+            Token::PauliSum => write!(f, "{}", KeywordToken::PauliSum),
+            Token::Permutation => write!(f, "{}", KeywordToken::Permutation),
             Token::RBracket => write!(f, "]"),
             Token::RParenthesis => write!(f, ")"),
             Token::Semicolon => write!(f, ";"),
-            Token::Sharing => write!(f, "SHARING"),
+            Token::Sharing => write!(f, "{}", KeywordToken::Sharing),
             Token::String(s) => write!(f, "{}", QuotedString(s)),
             Token::Variable(v) => write!(f, "{v}"),
         }