fix!: tokenize keywords and identifiers correctly (#428)
Merge pull request #428 from rigetti/427-keyword-tokenization
antalsz authored Jan 10, 2025
2 parents 51c6a66 + 4a65159 commit 74c2c6b
Showing 6 changed files with 157 additions and 180 deletions.
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions quil-rs/proptest-regressions/expression/mod.txt
@@ -6,3 +6,5 @@
 # everyone who runs the test benefits from these saved cases.
 cc 4c32128d724ed0f840715fae4e194c99262dc153c64be39d2acf45b8903b20f7 # shrinks to value = Complex { re: 0.0, im: -0.13530277317700273 }
 cc 5cc95f2159ad7120bbaf296d3a9fb26fef30f57b61e76b3e0dc99f4759009fdb # shrinks to e = Number(Complex { re: 0.0, im: -2.772221265116396 })
+cc de70a1853ccef983fac85a87761ba08bfb2d54b2d4e880d5d90e7b4a75ecafb5 # shrinks to e = Address(MemoryReference { name: "mut", index: 0 })
+cc 9ad50859b68cb403ce1a67af0feef1f55d25587466878e364ba2810be5910b14 # shrinks to e = Address(MemoryReference { name: "iNf", index: 0 })
152 changes: 46 additions & 106 deletions quil-rs/src/parser/lexer/mod.rs
@@ -16,8 +16,10 @@ mod error;
 mod quoted_strings;
 mod wrapped_parsers;
 
+use std::str::FromStr;
+
 use nom::{
-    bytes::complete::{is_a, tag_no_case, take_till, take_while, take_while1},
+    bytes::complete::{is_a, take_till, take_while, take_while1},
     character::complete::{digit1, one_of},
     combinator::{all_consuming, map, recognize, value},
     multi::many0,
@@ -28,7 +30,7 @@ use nom::{
 use nom_locate::LocatedSpan;
 use wrapped_parsers::{alt, tag};
 
-pub use super::token::{Token, TokenWithLocation};
+pub use super::token::{KeywordToken, Token, TokenWithLocation};
 use crate::parser::lexer::wrapped_parsers::expecting;
 use crate::parser::token::token_with_location;
 pub(crate) use error::InternalLexError;
@@ -92,7 +94,7 @@ pub enum Command {
     Xor,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
+#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
 #[strum(serialize_all = "UPPERCASE")]
 pub enum DataType {
     Bit,
@@ -101,7 +103,7 @@ pub enum DataType {
     Integer,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
+#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
 #[strum(serialize_all = "UPPERCASE")]
 pub enum Modifier {
     Controlled,
@@ -163,30 +165,17 @@ fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
         "a token",
         (
             token_with_location(lex_comment),
-            token_with_location(lex_data_type),
-            token_with_location(lex_modifier),
             token_with_location(lex_punctuation),
             token_with_location(lex_target),
             token_with_location(lex_string),
             // Operator must come before number (or it may be parsed as a prefix)
             token_with_location(lex_operator),
-            token_with_location(lex_number),
             token_with_location(lex_variable),
-            token_with_location(lex_non_blocking),
-            // This should come last because it's sort of a catch all
-            token_with_location(lex_command_or_identifier),
-        ),
-    )(input)
-}
-
-fn lex_data_type(input: LexInput) -> InternalLexResult {
-    alt(
-        "a data type",
-        (
-            value(Token::DataType(DataType::Bit), tag("BIT")),
-            value(Token::DataType(DataType::Integer), tag("INTEGER")),
-            value(Token::DataType(DataType::Octet), tag("OCTET")),
-            value(Token::DataType(DataType::Real), tag("REAL")),
+            // Identifiers must come before numbers so that `NaN`, `Inf`, and `Infinity` aren't
+            // parsed as floats; Nom, as of version 7.1.1, will parse those strings,
+            // case-insensitively, as floats
+            token_with_location(lex_keyword_or_identifier),
+            token_with_location(lex_number),
         ),
     )(input)
 }
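
A note on the reordering above: the new comment refers to Nom's own float parser, which accepts the IEEE-754 special spellings case-insensitively; this is exactly what the new proptest regression for a memory reference named "iNf" records. A minimal standalone sketch of the pitfall (an editor's illustration, not part of this diff; assumes a plain nom 7.x dependency):

use nom::number::complete::double;

fn main() {
    // nom's `double` happily consumes the special spellings...
    let (rest, value) = double::<&str, nom::error::Error<&str>>("iNf").unwrap();
    assert!(value.is_infinite() && rest.is_empty());
    // ...so if numbers were lexed before identifiers, "iNf" and "NaN"
    // would become Token::Float instead of Token::Identifier.
    let (_rest, value) = double::<&str, nom::error::Error<&str>>("NaN").unwrap();
    assert!(value.is_nan());
}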
@@ -197,62 +186,16 @@ fn lex_comment(input: LexInput) -> InternalLexResult {
     Ok((input, Token::Comment(content.to_string())))
 }
 
-/// If the given identifier string matches a command keyword, return the keyword;
-/// otherwise, return the original identifier as a token.
-fn recognize_command_or_identifier(identifier: String) -> Token {
-    use Command::*;
-
-    match identifier.as_str() {
-        "DEFGATE" => Token::Command(DefGate),
-        "ADD" => Token::Command(Add),
-        "AND" => Token::Command(And),
-        "CALL" => Token::Command(Call),
-        "CONVERT" => Token::Command(Convert),
-        "DIV" => Token::Command(Div),
-        "EQ" => Token::Command(Eq),
-        "EXCHANGE" => Token::Command(Exchange),
-        "GE" => Token::Command(GE),
-        "GT" => Token::Command(GT),
-        "IOR" => Token::Command(Ior),
-        "LE" => Token::Command(LE),
-        "LOAD" => Token::Command(Load),
-        "LT" => Token::Command(LT),
-        "MOVE" => Token::Command(Move),
-        "MUL" => Token::Command(Mul),
-        "NEG" => Token::Command(Neg),
-        "NOT" => Token::Command(Not),
-        "STORE" => Token::Command(Store),
-        "SUB" => Token::Command(Sub),
-        "XOR" => Token::Command(Xor),
-        "DEFCIRCUIT" => Token::Command(DefCircuit),
-        "MEASURE" => Token::Command(Measure),
-        "HALT" => Token::Command(Halt),
-        "WAIT" => Token::Command(Wait),
-        "JUMP-WHEN" => Token::Command(JumpWhen),
-        "JUMP-UNLESS" => Token::Command(JumpUnless),
-        "JUMP" => Token::Command(Jump),
-        "RESET" => Token::Command(Reset),
-        "NOP" => Token::Command(Nop),
-        "INCLUDE" => Token::Command(Include),
-        "PRAGMA" => Token::Command(Pragma),
-        "DECLARE" => Token::Command(Declare),
-        "CAPTURE" => Token::Command(Capture),
-        "DEFCAL" => Token::Command(DefCal),
-        "DEFFRAME" => Token::Command(DefFrame),
-        "DEFWAVEFORM" => Token::Command(DefWaveform),
-        "DELAY" => Token::Command(Delay),
-        "FENCE" => Token::Command(Fence),
-        "PULSE" => Token::Command(Pulse),
-        "RAW-CAPTURE" => Token::Command(RawCapture),
-        "SET-FREQUENCY" => Token::Command(SetFrequency),
-        "SET-PHASE" => Token::Command(SetPhase),
-        "SET-SCALE" => Token::Command(SetScale),
-        "SWAP-PHASES" => Token::Command(SwapPhases),
-        "SHIFT-FREQUENCY" => Token::Command(ShiftFrequency),
-        "SHIFT-PHASE" => Token::Command(ShiftPhase),
-        "LABEL" => Token::Command(Label),
-        _ => Token::Identifier(identifier),
+fn keyword_or_identifier(identifier: String) -> Token {
+    fn parse<T: FromStr>(token: impl Fn(T) -> Token, identifier: &str) -> Result<Token, T::Err> {
+        T::from_str(identifier).map(token)
     }
 
+    parse(KeywordToken::into, &identifier)
+        .or_else(|_| parse(Token::Command, &identifier))
+        .or_else(|_| parse(Token::DataType, &identifier))
+        .or_else(|_| parse(Token::Modifier, &identifier))
+        .unwrap_or(Token::Identifier(identifier))
 }
 
 fn is_valid_identifier_leading_character(chr: char) -> bool {
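
The rewritten classifier above replaces the hand-maintained keyword match: each keyword family now derives FromStr via strum::EnumString, so classification is a chain of exact, whole-string parses with Token::Identifier as the fallback. A self-contained sketch of the mechanism (illustrative only; assumes the strum crate with its derive feature):

use std::str::FromStr;
use strum::EnumString; // assumes strum = { version = "0.26", features = ["derive"] }

#[derive(Debug, PartialEq, EnumString)]
#[strum(serialize_all = "UPPERCASE")]
enum DataType {
    Bit,
    Integer,
    Octet,
    Real,
}

fn main() {
    // A whole-string, case-sensitive match succeeds...
    assert_eq!(DataType::from_str("BIT"), Ok(DataType::Bit));
    // ...while "BITS" and "bit" fall through to the identifier fallback,
    // unlike the old prefix-matching tag("BIT") and tag_no_case("MUT") parsers.
    assert!(DataType::from_str("BITS").is_err());
    assert!(DataType::from_str("bit").is_err());
}

Because from_str never matches a prefix, the old failure mode of lexing BITS as BIT followed by a stray S disappears, and keyword matching becomes case-sensitive, as the new test cases below confirm.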
@@ -286,9 +229,9 @@ fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
     )(input)
 }
 
-fn lex_command_or_identifier(input: LexInput) -> InternalLexResult {
+fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
     let (input, identifier) = lex_identifier_raw(input)?;
-    let token = recognize_command_or_identifier(identifier);
+    let token = keyword_or_identifier(identifier);
     Ok((input, token))
 }
 
@@ -298,10 +241,6 @@ fn lex_target(input: LexInput) -> InternalLexResult {
     Ok((input, Token::Target(label)))
 }
 
-fn lex_non_blocking(input: LexInput) -> InternalLexResult {
-    value(Token::NonBlocking, tag("NONBLOCKING"))(input)
-}
-
 fn lex_number(input: LexInput) -> InternalLexResult {
     let (input, float_string): (LexInput, LexInput) = recognize(double)(input)?;
     let integer_parse_result: IResult<LexInput, _> = all_consuming(digit1)(float_string);
@@ -318,24 +257,6 @@ fn lex_number(input: LexInput) -> InternalLexResult {
     ))
 }
 
-fn lex_modifier(input: LexInput) -> InternalLexResult {
-    alt(
-        "a modifier token",
-        (
-            value(Token::As, tag("AS")),
-            value(Token::Matrix, tag("MATRIX")),
-            value(Token::Modifier(Modifier::Controlled), tag("CONTROLLED")),
-            value(Token::Modifier(Modifier::Dagger), tag("DAGGER")),
-            value(Token::Modifier(Modifier::Forked), tag("FORKED")),
-            value(Token::Mutable, tag_no_case("MUT")),
-            value(Token::Offset, tag("OFFSET")),
-            value(Token::PauliSum, tag("PAULI-SUM")),
-            value(Token::Permutation, tag("PERMUTATION")),
-            value(Token::Sharing, tag("SHARING")),
-        ),
-    )(input)
-}
-
 fn lex_operator(input: LexInput) -> InternalLexResult {
     use Operator::*;
     map(
@@ -399,7 +320,7 @@ mod tests {
     use nom_locate::LocatedSpan;
    use rstest::*;
 
-    use crate::parser::common::tests::KITCHEN_SINK_QUIL;
+    use crate::parser::{common::tests::KITCHEN_SINK_QUIL, DataType};
 
     use super::{lex, Command, Operator, Token};
 
@@ -565,10 +486,29 @@
     case("a", vec![Token::Identifier("a".to_string())]),
     case("_a-2_b-2_", vec![Token::Identifier("_a-2_b-2_".to_string())]),
     case("a-2-%var", vec![
-        Token::Identifier("a-2".to_string()),
-        Token::Operator(Operator::Minus),
-        Token::Variable("var".to_string())
-    ])
+        Token::Identifier("a-2".to_string()),
+        Token::Operator(Operator::Minus),
+        Token::Variable("var".to_string())
+    ]),
+    case("BIT", vec![Token::DataType(DataType::Bit)]),
+    case("BITS", vec![Token::Identifier("BITS".to_string())]),
+    case("NaN", vec![Token::Identifier("NaN".to_string())]),
+    case("nan", vec![Token::Identifier("nan".to_string())]),
+    case("NaNa", vec![Token::Identifier("NaNa".to_string())]),
+    case("nana", vec![Token::Identifier("nana".to_string())]),
+    case("INF", vec![Token::Identifier("INF".to_string())]),
+    case("Infinity", vec![Token::Identifier("Infinity".to_string())]),
+    case("Inferior", vec![Token::Identifier("Inferior".to_string())]),
+    case("-NaN", vec![Token::Operator(Operator::Minus), Token::Identifier("NaN".to_string())]),
+    case("-inf", vec![Token::Operator(Operator::Minus), Token::Identifier("inf".to_string())]),
+    case("-Infinity", vec![
+        Token::Operator(Operator::Minus),
+        Token::Identifier("Infinity".to_string())
+    ]),
+    case("-inferior", vec![
+        Token::Operator(Operator::Minus),
+        Token::Identifier("inferior".to_string())
+    ]),
 )]
 fn it_lexes_identifier(input: &str, expected: Vec<Token>) {
     let input = LocatedSpan::new(input);
4 changes: 2 additions & 2 deletions quil-rs/src/parser/mod.rs
@@ -31,8 +31,8 @@ mod token;
 
 pub(crate) use error::{ErrorInput, InternalParseError};
 pub use error::{ParseError, ParserErrorKind};
-pub use lexer::LexError;
-pub use token::{Token, TokenWithLocation};
+pub use lexer::{Command, DataType, LexError, Modifier};
+pub use token::{KeywordToken, Token, TokenWithLocation};
 
 pub(crate) type ParserInput<'a> = &'a [TokenWithLocation<'a>];
 type InternalParserResult<'a, R, E = InternalParseError<'a>> = IResult<ParserInput<'a>, R, E>;
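
These broadened re-exports let downstream code name the token kinds directly. A hypothetical consumer-side sketch (the quil_rs::parser path is assumed from this file's location, and the assertion relies on Token's Display printing a data type the same way its other arms do):

use quil_rs::parser::{DataType, Token};

fn main() {
    // Constructing and displaying a token from outside the parser module,
    // which the widened re-exports above make possible.
    let token = Token::DataType(DataType::Real);
    assert_eq!(token.to_string(), "REAL");
}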
89 changes: 81 additions & 8 deletions quil-rs/src/parser/token.rs
@@ -67,6 +67,79 @@
     }
 }
 
+/// The subset of [`Token`]s which (a) do not have arguments and (b) are keywords. Used to ensure
+/// that keyword-checking remains in sync with the definition of [`Token`].
+#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
+#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
+pub enum KeywordToken {
+    As,
+    Matrix,
+    #[strum(serialize = "mut")]
+    Mutable,
+    #[strum(serialize = "NONBLOCKING")]
+    NonBlocking,
+    Offset,
+    PauliSum,
+    Permutation,
+    Sharing,
+}
+
+impl From<KeywordToken> for Token {
+    fn from(token: KeywordToken) -> Self {
+        match token {
+            KeywordToken::As => Token::As,
+            KeywordToken::Matrix => Token::Matrix,
+            KeywordToken::Mutable => Token::Mutable,
+            KeywordToken::NonBlocking => Token::NonBlocking,
+            KeywordToken::Offset => Token::Offset,
+            KeywordToken::PauliSum => Token::PauliSum,
+            KeywordToken::Permutation => Token::Permutation,
+            KeywordToken::Sharing => Token::Sharing,
+        }
+    }
+}
+
+impl TryFrom<Token> for KeywordToken {
+    type Error = ();
+
+    fn try_from(token: Token) -> Result<Self, Self::Error> {
+        // This match is explicit so that if you add a new [`Token`] constructor you have to decide
+        // if it's a keyword.
+        #[deny(clippy::wildcard_enum_match_arm, clippy::wildcard_in_or_patterns)]
+        match token {
+            Token::As => Ok(KeywordToken::As),
+            Token::Matrix => Ok(KeywordToken::Matrix),
+            Token::Mutable => Ok(KeywordToken::Mutable),
+            Token::NonBlocking => Ok(KeywordToken::NonBlocking),
+            Token::Offset => Ok(KeywordToken::Offset),
+            Token::PauliSum => Ok(KeywordToken::PauliSum),
+            Token::Permutation => Ok(KeywordToken::Permutation),
+            Token::Sharing => Ok(KeywordToken::Sharing),
+
+            Token::Colon
+            | Token::Comma
+            | Token::Command(_)
+            | Token::Comment(_)
+            | Token::DataType(_)
+            | Token::Float(_)
+            | Token::Identifier(_)
+            | Token::Indentation
+            | Token::Integer(_)
+            | Token::Target(_)
+            | Token::LBracket
+            | Token::LParenthesis
+            | Token::Modifier(_)
+            | Token::NewLine
+            | Token::Operator(_)
+            | Token::RBracket
+            | Token::RParenthesis
+            | Token::Semicolon
+            | Token::String(_)
+            | Token::Variable(_) => Err(()),
+        }
+    }
+}
+
 #[derive(Clone, PartialEq)]
 pub enum Token {
     As,
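
Two details of the new KeywordToken above deserve a note. First, the strum attributes: SCREAMING-KEBAB-CASE renders PauliSum as PAULI-SUM, which is why Mutable and NonBlocking carry explicit overrides (the keyword mut is lowercase, and NONBLOCKING has no hyphen). A standalone sketch of that behavior (illustrative enum name; assumes strum with its derive feature):

use std::str::FromStr;
use strum::{Display, EnumString}; // assumes strum = { version = "0.26", features = ["derive"] }

#[derive(Debug, PartialEq, Display, EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
enum Kw {
    #[strum(serialize = "mut")]
    Mutable,
    NonBlocking, // without an override, this serializes as "NON-BLOCKING"
    PauliSum,
}

fn main() {
    assert_eq!(Kw::PauliSum.to_string(), "PAULI-SUM");
    assert_eq!(Kw::Mutable.to_string(), "mut");
    assert_eq!(Kw::NonBlocking.to_string(), "NON-BLOCKING");
    assert_eq!(Kw::from_str("mut"), Ok(Kw::Mutable));
    assert!(Kw::from_str("MUT").is_err()); // exact case only
}

Second, the exhaustive TryFrom<Token> match with its deny(clippy::wildcard_enum_match_arm) attribute turns keyword-ness into a compile-time checklist: adding a Token variant fails cargo clippy until someone decides whether the new token is a keyword.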
@@ -102,7 +175,7 @@ pub enum Token {
 impl fmt::Display for Token {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            Token::As => write!(f, "AS"),
+            Token::As => write!(f, "{}", KeywordToken::As),
             Token::Colon => write!(f, ":"),
             Token::Comma => write!(f, ","),
             Token::Command(cmd) => write!(f, "{cmd}"),
@@ -115,19 +188,19 @@ impl fmt::Display for Token {
             Token::Target(label) => write!(f, "{label}"),
             Token::LBracket => write!(f, "["),
             Token::LParenthesis => write!(f, "("),
-            Token::NonBlocking => write!(f, "NONBLOCKING"),
-            Token::Matrix => write!(f, "MATRIX"),
+            Token::NonBlocking => write!(f, "{}", KeywordToken::NonBlocking),
+            Token::Matrix => write!(f, "{}", KeywordToken::Matrix),
             Token::Modifier(m) => write!(f, "{m}"),
-            Token::Mutable => write!(f, "MUT"),
+            Token::Mutable => write!(f, "{}", KeywordToken::Mutable),
             Token::NewLine => write!(f, "NEWLINE"),
             Token::Operator(op) => write!(f, "{op}"),
-            Token::Offset => write!(f, "OFFSET"),
-            Token::PauliSum => write!(f, "PAULI-SUM"),
-            Token::Permutation => write!(f, "PERMUTATION"),
+            Token::Offset => write!(f, "{}", KeywordToken::Offset),
+            Token::PauliSum => write!(f, "{}", KeywordToken::PauliSum),
+            Token::Permutation => write!(f, "{}", KeywordToken::Permutation),
             Token::RBracket => write!(f, "]"),
             Token::RParenthesis => write!(f, ")"),
             Token::Semicolon => write!(f, ";"),
-            Token::Sharing => write!(f, "SHARING"),
+            Token::Sharing => write!(f, "{}", KeywordToken::Sharing),
             Token::String(s) => write!(f, "{}", QuotedString(s)),
             Token::Variable(v) => write!(f, "{v}"),
         }