Skip to content

Commit

Permalink
fix!: only parse keywords if they form the whole identifier
Browse files Browse the repository at this point in the history
This prevents `BITS` from being treated as the keyword `BIT` followed by the
identifier `S`.
  • Loading branch information
antalsz committed Jan 9, 2025
1 parent a6933ba commit 13f6fef
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 175 deletions.
116 changes: 17 additions & 99 deletions quil-rs/src/parser/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ mod error;
mod quoted_strings;
mod wrapped_parsers;

use std::str::FromStr;

use nom::{
bytes::complete::{is_a, take_till, take_while, take_while1},
character::complete::{digit1, one_of},
Expand All @@ -28,7 +30,7 @@ use nom::{
use nom_locate::LocatedSpan;
use wrapped_parsers::{alt, tag};

pub use super::token::{Token, TokenWithLocation};
pub use super::token::{KeywordToken, Token, TokenWithLocation};
use crate::parser::lexer::wrapped_parsers::expecting;
use crate::parser::token::token_with_location;
pub(crate) use error::InternalLexError;
Expand Down Expand Up @@ -92,7 +94,7 @@ pub enum Command {
Xor,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum DataType {
Bit,
Expand All @@ -101,7 +103,7 @@ pub enum DataType {
Integer,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum Modifier {
Controlled,
Expand Down Expand Up @@ -163,30 +165,14 @@ fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
"a token",
(
token_with_location(lex_comment),
token_with_location(lex_data_type),
token_with_location(lex_modifier),
token_with_location(lex_punctuation),
token_with_location(lex_target),
token_with_location(lex_string),
// Operator must come before number (or it may be parsed as a prefix)
token_with_location(lex_operator),
token_with_location(lex_number),
token_with_location(lex_variable),
token_with_location(lex_non_blocking),
// This should come last because it's sort of a catch all
token_with_location(lex_command_or_identifier),
),
)(input)
}

fn lex_data_type(input: LexInput) -> InternalLexResult {
alt(
"a data type",
(
value(Token::DataType(DataType::Bit), tag("BIT")),
value(Token::DataType(DataType::Integer), tag("INTEGER")),
value(Token::DataType(DataType::Octet), tag("OCTET")),
value(Token::DataType(DataType::Real), tag("REAL")),
token_with_location(lex_keyword_or_identifier),
),
)(input)
}
Expand All @@ -197,62 +183,16 @@ fn lex_comment(input: LexInput) -> InternalLexResult {
Ok((input, Token::Comment(content.to_string())))
}

/// If the given identifier string matches a command keyword, return the keyword;
/// otherwise, return the original identifier as a token.
fn recognize_command_or_identifier(identifier: String) -> Token {
use Command::*;

match identifier.as_str() {
"DEFGATE" => Token::Command(DefGate),
"ADD" => Token::Command(Add),
"AND" => Token::Command(And),
"CALL" => Token::Command(Call),
"CONVERT" => Token::Command(Convert),
"DIV" => Token::Command(Div),
"EQ" => Token::Command(Eq),
"EXCHANGE" => Token::Command(Exchange),
"GE" => Token::Command(GE),
"GT" => Token::Command(GT),
"IOR" => Token::Command(Ior),
"LE" => Token::Command(LE),
"LOAD" => Token::Command(Load),
"LT" => Token::Command(LT),
"MOVE" => Token::Command(Move),
"MUL" => Token::Command(Mul),
"NEG" => Token::Command(Neg),
"NOT" => Token::Command(Not),
"STORE" => Token::Command(Store),
"SUB" => Token::Command(Sub),
"XOR" => Token::Command(Xor),
"DEFCIRCUIT" => Token::Command(DefCircuit),
"MEASURE" => Token::Command(Measure),
"HALT" => Token::Command(Halt),
"WAIT" => Token::Command(Wait),
"JUMP-WHEN" => Token::Command(JumpWhen),
"JUMP-UNLESS" => Token::Command(JumpUnless),
"JUMP" => Token::Command(Jump),
"RESET" => Token::Command(Reset),
"NOP" => Token::Command(Nop),
"INCLUDE" => Token::Command(Include),
"PRAGMA" => Token::Command(Pragma),
"DECLARE" => Token::Command(Declare),
"CAPTURE" => Token::Command(Capture),
"DEFCAL" => Token::Command(DefCal),
"DEFFRAME" => Token::Command(DefFrame),
"DEFWAVEFORM" => Token::Command(DefWaveform),
"DELAY" => Token::Command(Delay),
"FENCE" => Token::Command(Fence),
"PULSE" => Token::Command(Pulse),
"RAW-CAPTURE" => Token::Command(RawCapture),
"SET-FREQUENCY" => Token::Command(SetFrequency),
"SET-PHASE" => Token::Command(SetPhase),
"SET-SCALE" => Token::Command(SetScale),
"SWAP-PHASES" => Token::Command(SwapPhases),
"SHIFT-FREQUENCY" => Token::Command(ShiftFrequency),
"SHIFT-PHASE" => Token::Command(ShiftPhase),
"LABEL" => Token::Command(Label),
_ => Token::Identifier(identifier),
fn keyword_or_identifier(identifier: String) -> Token {
fn parse<T: FromStr>(token: impl Fn(T) -> Token, identifier: &str) -> Result<Token, T::Err> {
T::from_str(identifier).map(token)
}

parse(KeywordToken::into, &identifier)
.or_else(|_| parse(Token::Command, &identifier))
.or_else(|_| parse(Token::DataType, &identifier))
.or_else(|_| parse(Token::Modifier, &identifier))
.unwrap_or(Token::Identifier(identifier))
}

fn is_valid_identifier_leading_character(chr: char) -> bool {
Expand Down Expand Up @@ -286,9 +226,9 @@ fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
)(input)
}

fn lex_command_or_identifier(input: LexInput) -> InternalLexResult {
/// Lex a complete identifier, then classify it as either a keyword token or a
/// plain identifier via [`keyword_or_identifier`].
fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
    lex_identifier_raw(input)
        .map(|(remaining, identifier)| (remaining, keyword_or_identifier(identifier)))
}

Expand All @@ -298,10 +238,6 @@ fn lex_target(input: LexInput) -> InternalLexResult {
Ok((input, Token::Target(label)))
}

fn lex_non_blocking(input: LexInput) -> InternalLexResult {
value(Token::NonBlocking, tag("NONBLOCKING"))(input)
}

fn lex_number(input: LexInput) -> InternalLexResult {
let (input, float_string): (LexInput, LexInput) = recognize(double)(input)?;
let integer_parse_result: IResult<LexInput, _> = all_consuming(digit1)(float_string);
Expand All @@ -318,24 +254,6 @@ fn lex_number(input: LexInput) -> InternalLexResult {
))
}

fn lex_modifier(input: LexInput) -> InternalLexResult {
alt(
"a modifier token",
(
value(Token::As, tag("AS")),
value(Token::Matrix, tag("MATRIX")),
value(Token::Modifier(Modifier::Controlled), tag("CONTROLLED")),
value(Token::Modifier(Modifier::Dagger), tag("DAGGER")),
value(Token::Modifier(Modifier::Forked), tag("FORKED")),
value(Token::Mutable, tag("mut")),
value(Token::Offset, tag("OFFSET")),
value(Token::PauliSum, tag("PAULI-SUM")),
value(Token::Permutation, tag("PERMUTATION")),
value(Token::Sharing, tag("SHARING")),
),
)(input)
}

fn lex_operator(input: LexInput) -> InternalLexResult {
use Operator::*;
map(
Expand Down
4 changes: 2 additions & 2 deletions quil-rs/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ mod token;

pub(crate) use error::{ErrorInput, InternalParseError};
pub use error::{ParseError, ParserErrorKind};
pub use lexer::{DataType, LexError};
pub use token::{Token, TokenWithLocation};
pub use lexer::{Command, DataType, LexError, Modifier};
pub use token::{KeywordToken, Token, TokenWithLocation};

pub(crate) type ParserInput<'a> = &'a [TokenWithLocation<'a>];
type InternalParserResult<'a, R, E = InternalParseError<'a>> = IResult<ParserInput<'a>, R, E>;
Expand Down
91 changes: 80 additions & 11 deletions quil-rs/src/parser/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,78 @@ where
}
}

// When adding tokens that are keywords, you also need to update
// [`crate::reserved::ReservedKeyword`], and similarly for gates ([`crate::reserved::ReservedGate`])
// and constants ([`crate::reserved::ReservedConstant`]).
/// The subset of [`Token`]s which (a) do not contain more specific data and (b) are keywords. Used
/// to ensure that keyword-checking remains in sync with the definition of [`Token`].
#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
pub enum KeywordToken {
    /// The `AS` keyword (e.g. in `DEFGATE ... AS ...`).
    As,
    /// The `MATRIX` gate-definition keyword.
    Matrix,
    /// The `mut` keyword — lowercase, so it overrides the SCREAMING-KEBAB-CASE default.
    #[strum(serialize = "mut")]
    Mutable,
    /// The `NONBLOCKING` keyword — serialized without the hyphen that
    /// SCREAMING-KEBAB-CASE would otherwise insert.
    #[strum(serialize = "NONBLOCKING")]
    NonBlocking,
    /// The `OFFSET` keyword used in memory declarations.
    Offset,
    /// The `PAULI-SUM` gate-definition keyword.
    PauliSum,
    /// The `PERMUTATION` gate-definition keyword.
    Permutation,
    /// The `SHARING` keyword used in memory declarations.
    Sharing,
}

impl From<KeywordToken> for Token {
fn from(token: KeywordToken) -> Self {
match token {
KeywordToken::As => Token::As,
KeywordToken::Matrix => Token::Matrix,
KeywordToken::Mutable => Token::Mutable,
KeywordToken::NonBlocking => Token::NonBlocking,
KeywordToken::Offset => Token::Offset,
KeywordToken::PauliSum => Token::PauliSum,
KeywordToken::Permutation => Token::Permutation,
KeywordToken::Sharing => Token::Sharing,
}
}
}

impl TryFrom<Token> for KeywordToken {
    type Error = ();

    /// Narrow a [`Token`] back into a [`KeywordToken`], returning `Err(())`
    /// for any token that is not a keyword. This is the inverse of
    /// `From<KeywordToken> for Token`, and the two must stay in sync.
    fn try_from(token: Token) -> Result<Self, Self::Error> {
        // This match is explicit so that if you add a new [`Token`] constructor you have to decide
        // if it's a keyword. Please do not add a top-level wildcard match here.
        match token {
            Token::As => Ok(KeywordToken::As),
            Token::Matrix => Ok(KeywordToken::Matrix),
            Token::Mutable => Ok(KeywordToken::Mutable),
            // Fix: `NonBlocking` is a keyword (see `From<KeywordToken> for Token`
            // and `KeywordToken::NonBlocking`) but was previously grouped with the
            // non-keyword tokens below, breaking the keyword round-trip.
            Token::NonBlocking => Ok(KeywordToken::NonBlocking),
            Token::Offset => Ok(KeywordToken::Offset),
            Token::PauliSum => Ok(KeywordToken::PauliSum),
            Token::Permutation => Ok(KeywordToken::Permutation),
            Token::Sharing => Ok(KeywordToken::Sharing),

            // Everything else carries data or is punctuation/structure, not a keyword.
            Token::Colon
            | Token::Comma
            | Token::Command(_)
            | Token::Comment(_)
            | Token::DataType(_)
            | Token::Float(_)
            | Token::Identifier(_)
            | Token::Indentation
            | Token::Integer(_)
            | Token::Target(_)
            | Token::LBracket
            | Token::LParenthesis
            | Token::Modifier(_)
            | Token::NewLine
            | Token::Operator(_)
            | Token::RBracket
            | Token::RParenthesis
            | Token::Semicolon
            | Token::String(_)
            | Token::Variable(_) => Err(()),
        }
    }
}

#[derive(Clone, PartialEq)]
pub enum Token {
As,
Expand Down Expand Up @@ -105,7 +174,7 @@ pub enum Token {
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Token::As => write!(f, "AS"),
Token::As => write!(f, "{}", KeywordToken::As),
Token::Colon => write!(f, ":"),
Token::Comma => write!(f, ","),
Token::Command(cmd) => write!(f, "{cmd}"),
Expand All @@ -118,19 +187,19 @@ impl fmt::Display for Token {
Token::Target(label) => write!(f, "{label}"),
Token::LBracket => write!(f, "["),
Token::LParenthesis => write!(f, "("),
Token::NonBlocking => write!(f, "NONBLOCKING"),
Token::Matrix => write!(f, "MATRIX"),
Token::NonBlocking => write!(f, "{}", KeywordToken::NonBlocking),
Token::Matrix => write!(f, "{}", KeywordToken::Matrix),
Token::Modifier(m) => write!(f, "{m}"),
Token::Mutable => write!(f, "mut"),
Token::Mutable => write!(f, "{}", KeywordToken::Mutable),
Token::NewLine => write!(f, "NEWLINE"),
Token::Operator(op) => write!(f, "{op}"),
Token::Offset => write!(f, "OFFSET"),
Token::PauliSum => write!(f, "PAULI-SUM"),
Token::Permutation => write!(f, "PERMUTATION"),
Token::Offset => write!(f, "{}", KeywordToken::Offset),
Token::PauliSum => write!(f, "{}", KeywordToken::PauliSum),
Token::Permutation => write!(f, "{}", KeywordToken::Permutation),
Token::RBracket => write!(f, "]"),
Token::RParenthesis => write!(f, ")"),
Token::Semicolon => write!(f, ";"),
Token::Sharing => write!(f, "SHARING"),
Token::Sharing => write!(f, "{}", KeywordToken::Sharing),
Token::String(s) => write!(f, "{}", QuotedString(s)),
Token::Variable(v) => write!(f, "{v}"),
}
Expand Down
Loading

0 comments on commit 13f6fef

Please sign in to comment.