Lex Jupyter line magic with `Mode::Jupyter` (#23)

Lex Jupyter line magic with `Mode::Jupyter`

This PR adds a new token `MagicCommand`[^1] which the lexer will
recognize when in `Mode::Jupyter`. The rules for the lexer are as
follows:
1. Given that we are at the start of line, skip the indentation and look
for [characters that represent the start of a magic
command](635815e8f1/IPython/core/inputtransformer2.py (L335-L346)),
determine the magic kind and capture all the characters following it as
the command string.
2. If the command extends multiple lines, the lexer will skip the line
continuation character (`\`) but only if it's followed by a newline
(`\n` or `\r`). The backslash is skipped only in that case because backslashes
can also occur inside the command string itself, where they must be kept:

	```rust
    //        Skip this backslash
    //        v
    //   !pwd \
    //      && ls -a | sed 's/^/\\    /'
    //                          ^^
    //                          Don't skip these backslashes
	```

3. The parser, when in `Mode::Jupyter`, will filter these tokens before
the parsing begins. There is a small caveat when the magic command is
indented. In the following example, when the parser filters out the magic
command, it'll throw an indentation error:

	```python
	for i in range(5):
		!ls

	# What the parser will see
	for i in range(5):
	
	```

[^1]: I would prefer to have some other name, as this represents not only
a line magic (`%`) but also a shell command (`!`), a help command (`?`) and
others. In the original implementation, it's named ["IPython
Syntax"](635815e8f1/IPython/core/inputtransformer2.py (L332)).
This commit is contained in:
Dhruv Manilawala 2023-07-18 09:24:24 +05:30 committed by GitHub
parent 126652b684
commit 3b4c8fffe5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 625 additions and 9 deletions

View File

@ -1,7 +1,7 @@
//! Control in the different modes by which a source file can be parsed.
/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub enum Mode {
/// The code consists of a sequence of statements.
Module,
@ -9,6 +9,34 @@ pub enum Mode {
Interactive,
/// The code consists of a single expression.
Expression,
/// The code consists of a sequence of statements which are part of a
/// Jupyter Notebook and thus could include escape commands scoped to
/// a single line.
///
/// ## Limitations:
///
/// These escaped commands are only supported when they are the only
/// statement on a line. If they're part of a larger statement such as
/// on the right-hand side of an assignment, the lexer will not recognize
/// them as escape commands.
///
/// For [Dynamic object information], the escape characters (`?`, `??`)
/// must be used before an object. For example, `?foo` will be recognized,
/// but `foo?` will not.
///
/// ## Supported escape commands:
/// - [Magic command system] which is limited to [line magics] and can start
/// with `%` or `%%`.
/// - [Dynamic object information] which can start with `?` or `??`.
/// - [System shell access] which can start with `!` or `!!`.
/// - [Automatic parentheses and quotes] which can start with `/`, `;`, or `,`.
///
/// [Magic command system]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#magic-command-system
/// [line magics]: https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics
/// [Dynamic object information]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#dynamic-object-information
/// [System shell access]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#system-shell-access
/// [Automatic parentheses and quotes]: https://ipython.readthedocs.io/en/stable/interactive/reference.html#automatic-parentheses-and-quotes
Jupyter,
}
impl std::str::FromStr for Mode {
@ -17,6 +45,7 @@ impl std::str::FromStr for Mode {
match s {
"exec" | "single" => Ok(Mode::Module),
"eval" => Ok(Mode::Expression),
"jupyter" => Ok(Mode::Jupyter),
_ => Err(ModeParseError),
}
}
@ -28,6 +57,6 @@ pub struct ModeParseError;
impl std::fmt::Display for ModeParseError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, r#"mode must be "exec", "eval", or "single""#)
write!(f, r#"mode must be "exec", "eval", "jupyter", or "single""#)
}
}

View File

@ -32,7 +32,7 @@ use crate::{
soft_keywords::SoftKeywordTransformer,
string::FStringErrorType,
text_size::{TextLen, TextRange, TextSize},
token::{StringKind, Tok},
token::{MagicKind, StringKind, Tok},
Mode,
};
use log::trace;
@ -175,6 +175,8 @@ pub struct Lexer<T: Iterator<Item = char>> {
pending: Vec<Spanned>,
// The current location.
location: TextSize,
// Lexer mode.
mode: Mode,
}
// generated in build.rs, in gen_phf()
@ -213,7 +215,7 @@ pub fn lex_starts_at(
mode: Mode,
start_offset: TextSize,
) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
SoftKeywordTransformer::new(Lexer::new(source.chars(), start_offset), mode)
SoftKeywordTransformer::new(Lexer::new(source.chars(), mode, start_offset), mode)
}
impl<T> Lexer<T>
@ -222,7 +224,7 @@ where
{
/// Create a new lexer from T and a starting location. You probably want to use
/// [`lex`] instead.
pub fn new(input: T, start: TextSize) -> Self {
pub fn new(input: T, mode: Mode, start: TextSize) -> Self {
let mut lxr = Lexer {
at_begin_of_line: true,
nesting: 0,
@ -231,6 +233,7 @@ where
pending: Vec::with_capacity(5),
location: start,
window: CharWindow::new(input),
mode,
};
// Fill the window.
lxr.window.slide();
@ -494,6 +497,59 @@ where
Ok(())
}
/// Lex one Jupyter magic command of the given `kind`, starting at its escape prefix.
///
/// The prefix characters are consumed but not stored. Every character after the
/// prefix, up to (but not including) the next `\n`, `\r` or the end of the input,
/// is collected into the token's `value`.
///
/// A backslash that is immediately followed by a line break is treated as a line
/// continuation: both characters are dropped and lexing carries on with the next
/// line. Any other backslash belongs to the command and is kept as is:
///
/// ```text
///        Skip this backslash
///        v
///   !pwd \
///      && ls -a | sed 's/^/\\    /'
///                          ^^
///                          Don't skip these backslashes
/// ```
///
/// Returns the `Tok::MagicCommand` together with the range that starts at the
/// prefix and ends at the position where lexing stopped.
fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) {
    let start_pos = self.get_pos();

    // Step over the escape prefix (one or two characters, depending on the kind).
    let prefix_len = u32::from(kind.prefix_len());
    for _ in 0..prefix_len {
        self.next_char();
    }

    let mut command = String::new();
    loop {
        match (self.window[0], self.window[1]) {
            // Line continuation: drop the backslash and the line break after it.
            (Some('\\'), Some('\n' | '\r')) => {
                self.next_char();
                self.next_char();
            }
            // End of the line or of the input terminates the command.
            (Some('\n' | '\r') | None, _) => break,
            // Anything else, including a lone backslash, is part of the command.
            (Some(_), _) => command.push(self.next_char().unwrap()),
        }
    }

    let range = TextRange::new(start_pos, self.get_pos());
    (
        Tok::MagicCommand {
            kind,
            value: command,
        },
        range,
    )
}
/// Work out which magic escape starts at the current position and, if there is
/// one, lex the whole command and emit the resulting token.
///
/// The two-character escapes are tried first; when the pair is not an escape, the
/// first character alone is tried. Nothing is emitted when no character is left
/// or when neither form is a known escape.
fn lex_and_emit_magic_command(&mut self) {
    let detected = match (self.window[0], self.window[1]) {
        (Some(c1), Some(c2)) => {
            MagicKind::try_from([c1, c2]).or_else(|_| MagicKind::try_from(c1))
        }
        // When the escape character is the last character of the file.
        (Some(c), None) => MagicKind::try_from(c),
        (None, _) => return,
    };

    if let Ok(kind) = detected {
        let token = self.lex_magic_command(kind);
        self.emit(token);
    }
}
/// Lex a string literal.
fn lex_string(&mut self, kind: StringKind) -> LexResult {
let start_pos = self.get_pos();
@ -644,6 +700,10 @@ where
spaces = 0;
tabs = 0;
}
// https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L345
Some('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => {
self.lex_and_emit_magic_command();
}
Some('\x0C') => {
// Form feed character!
// Reset indentation for the Emacs user.
@ -1381,6 +1441,11 @@ mod tests {
lexer.map(|x| x.unwrap().0).collect()
}
/// Lex `source` in [`Mode::Jupyter`] and return only the tokens, discarding their
/// ranges. Panics on the first lexer error.
pub fn lex_jupyter_source(source: &str) -> Vec<Tok> {
    lex(source, Mode::Jupyter)
        .map(|spanned| {
            let (tok, _) = spanned.unwrap();
            tok
        })
        .collect()
}
fn str_tok(s: &str) -> Tok {
Tok::String {
value: s.to_owned(),
@ -1397,6 +1462,213 @@ mod tests {
}
}
/// Lexes a `%matplotlib` magic whose arguments continue after a backslash followed
/// by `eol`, and asserts that exactly one `MagicCommand` token of kind
/// `MagicKind::Magic` is produced with the continuation removed.
// NOTE(review): the input has a space on each side of the continuation, so the
// lexer loop would appear to yield two spaces between `matplotlib` and `--inline`,
// while the expected value below shows one — confirm the literals are intact.
fn assert_jupyter_magic_line_continuation_with_eol(eol: &str) {
let source = format!("%matplotlib \\{} --inline", eol);
let tokens = lex_jupyter_source(&source);
assert_eq!(
tokens,
vec![Tok::MagicCommand {
value: "matplotlib --inline".to_string(),
kind: MagicKind::Magic
},]
)
}
/// Runs the magic line-continuation check with `UNIX_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_unix_eol() {
assert_jupyter_magic_line_continuation_with_eol(UNIX_EOL);
}
/// Runs the magic line-continuation check with `MAC_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_mac_eol() {
assert_jupyter_magic_line_continuation_with_eol(MAC_EOL);
}
/// Runs the magic line-continuation check with `WINDOWS_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_windows_eol() {
assert_jupyter_magic_line_continuation_with_eol(WINDOWS_EOL);
}
/// Asserts that a magic command ending in a backslash plus `eol`, with nothing
/// after it, lexes to a single `MagicCommand` whose value stops before the
/// continuation.
fn assert_jupyter_magic_line_continuation_with_eol_and_eof(eol: &str) {
    let expected = vec![Tok::MagicCommand {
        value: "matplotlib ".to_string(),
        kind: MagicKind::Magic,
    }];

    // The input ends right after the continuation, so no further line follows.
    let source = format!("%matplotlib \\{eol}");
    assert_eq!(lex_jupyter_source(&source), expected);
}
/// Runs the continuation-at-end-of-input check with `UNIX_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_unix_eol_and_eof() {
assert_jupyter_magic_line_continuation_with_eol_and_eof(UNIX_EOL);
}
/// Runs the continuation-at-end-of-input check with `MAC_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_mac_eol_and_eof() {
assert_jupyter_magic_line_continuation_with_eol_and_eof(MAC_EOL);
}
/// Runs the continuation-at-end-of-input check with `WINDOWS_EOL` as the line terminator.
#[test]
fn test_jupyter_magic_line_continuation_windows_eol_and_eof() {
assert_jupyter_magic_line_continuation_with_eol_and_eof(WINDOWS_EOL);
}
/// Checks that every escape prefix standing alone on a line lexes to a
/// `MagicCommand` with an empty value and the matching `MagicKind`.
#[test]
fn test_empty_jupyter_magic() {
// One prefix per line, in the same order as the expected tokens below.
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
let tokens = lex_jupyter_source(source);
assert_eq!(
tokens,
vec![
// The `NonLogicalNewline` separators are only expected when the
// `full-lexer` feature is enabled.
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Magic2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Shell,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::ShCap,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Help,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Help2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Paren,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Quote,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "".to_string(),
kind: MagicKind::Quote2,
},
]
)
}
/// Lexes one command for each kind of escape (help, line magic, shell, shell with
/// captured output, paren and quote forms) and checks both the detected
/// `MagicKind` and the captured command text. The input also covers a `%` inside
/// a command, backslash-newline continuations, and backslashes that are part of
/// the command and must be kept.
#[test]
fn test_jupyter_magic() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
let tokens = lex_jupyter_source(source);
assert_eq!(
tokens,
vec![
// The `NonLogicalNewline` separators are only expected when the
// `full-lexer` feature is enabled.
Tok::MagicCommand {
value: "foo".to_string(),
kind: MagicKind::Help,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo".to_string(),
kind: MagicKind::Help2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "timeit a = b".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "timeit a % 3".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "matplotlib --inline".to_string(),
kind: MagicKind::Magic,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(),
kind: MagicKind::Shell,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "cd /Users/foo/Library/Application\\ Support/".to_string(),
kind: MagicKind::ShCap,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Paren,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Quote,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "foo 1 2".to_string(),
kind: MagicKind::Quote2,
},
#[cfg(feature = "full-lexer")]
Tok::NonLogicalNewline,
Tok::MagicCommand {
value: "ls".to_string(),
kind: MagicKind::Shell,
},
]
)
}
#[test]
fn test_numbers() {
let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j";

View File

@ -250,7 +250,7 @@ impl Parse for ast::Constant {
}
/// Parse a full Python program usually consisting of multiple lines.
///
///
/// This is a convenience function that can be used to parse a full Python program without having to
/// specify the [`Mode`] or the location. It is probably what you want to use most of the time.
///
@ -326,7 +326,8 @@ pub fn parse_expression_starts_at(
/// Parse the given Python source code using the specified [`Mode`].
///
/// This function is the most general function to parse Python code. Based on the [`Mode`] supplied,
/// it can be used to parse a single expression, a full Python program or an interactive expression.
/// it can be used to parse a single expression, a full Python program, an interactive expression
/// or a Python program containing Jupyter magics.
///
/// # Example
///
@ -354,6 +355,20 @@ pub fn parse_expression_starts_at(
/// let program = parse(source, Mode::Module, "<embedded>");
/// assert!(program.is_ok());
/// ```
///
/// Additionally, we can parse a Python program containing Jupyter magics:
///
/// ```
/// use rustpython_parser::{Mode, parse};
///
/// let source = r#"
/// %timeit 1 + 2
/// ?str.replace
/// !ls
/// "#;
/// let program = parse(source, Mode::Jupyter, "<embedded>");
/// assert!(program.is_ok());
/// ```
pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
parse_starts_at(source, mode, source_path, TextSize::default())
}
@ -394,6 +409,9 @@ pub fn parse_starts_at(
///
/// This could allow you to perform some preprocessing on the tokens before parsing them.
///
/// When in [`Mode::Jupyter`], this will filter out all the Jupyter magic commands
/// before parsing the tokens.
///
/// # Example
///
/// As an example, instead of parsing a string, we can parse a list of tokens after we generate
@ -414,7 +432,12 @@ pub fn parse_tokens(
#[cfg(feature = "full-lexer")]
let lxr =
lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
parse_filtered_tokens(lxr, mode, source_path)
if mode == Mode::Jupyter {
let lxr = lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::MagicCommand { .. }));
parse_filtered_tokens(lxr, mode, source_path)
} else {
parse_filtered_tokens(lxr, mode, source_path)
}
}
fn parse_filtered_tokens(
@ -1238,4 +1261,65 @@ class Abcd:
.unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
/// Parses a `Mode::Jupyter` source that mixes ordinary Python with every
/// supported escape form and snapshots the resulting AST.
///
/// The Python fragments deliberately use `%` and `!=` as real operators, and
/// follow the escape lines, so the snapshot shows whether an escape swallowed or
/// corrupted any surrounding code. The final `for` loop has an indented magic
/// followed by `pass`, so its body still holds a statement when the magic line
/// is dropped.
#[test]
fn test_jupyter_magic() {
let parse_ast = parse(
r#"
# Normal Python code
(
a
%
b
)
# Dynamic object info
??a.foo
?a.foo
?a.foo?
??a.foo()??
# Line magic
%timeit a = b
%timeit foo(b) % 3
%alias showPath pwd && ls -a
%timeit a =\
foo(b); b = 2
%matplotlib --inline
%matplotlib \
--inline
# System shell access
!pwd && ls -a | sed 's/^/\ /'
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
# Let's add some Python code to make sure that earlier escapes were handled
# correctly and that we didn't consume any of the following code as a result
# of the escapes.
def foo():
return (
a
!=
b
)
# Transforms into `foo(..)`
/foo 1 2
;foo 1 2
,foo 1 2
# Indented magic
for a in range(5):
%ls
pass
"#
.trim(),
Mode::Jupyter,
"<test>",
)
.unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
}

View File

@ -0,0 +1,135 @@
---
source: parser/src/parser.rs
expression: parse_ast
---
Module(
ModModule {
range: 0..736,
body: [
Expr(
StmtExpr {
range: 21..42,
value: BinOp(
ExprBinOp {
range: 27..40,
left: Name(
ExprName {
range: 27..28,
id: "a",
ctx: Load,
},
),
op: Mod,
right: Name(
ExprName {
range: 39..40,
id: "b",
ctx: Load,
},
),
},
),
},
),
FunctionDef(
StmtFunctionDef {
range: 566..626,
name: Identifier {
id: "foo",
range: 570..573,
},
args: Arguments {
range: 573..575,
posonlyargs: [],
args: [],
vararg: None,
kwonlyargs: [],
kwarg: None,
},
body: [
Return(
StmtReturn {
range: 581..626,
value: Some(
Compare(
ExprCompare {
range: 598..620,
left: Name(
ExprName {
range: 598..599,
id: "a",
ctx: Load,
},
),
ops: [
NotEq,
],
comparators: [
Name(
ExprName {
range: 619..620,
id: "b",
ctx: Load,
},
),
],
},
),
),
},
),
],
decorator_list: [],
returns: None,
type_comment: None,
},
),
For(
StmtFor {
range: 701..736,
target: Name(
ExprName {
range: 705..706,
id: "a",
ctx: Store,
},
),
iter: Call(
ExprCall {
range: 710..718,
func: Name(
ExprName {
range: 710..715,
id: "range",
ctx: Load,
},
),
args: [
Constant(
ExprConstant {
range: 716..717,
value: Int(
5,
),
kind: None,
},
),
],
keywords: [],
},
),
body: [
Pass(
StmtPass {
range: 732..736,
},
),
],
orelse: [],
type_comment: None,
},
),
],
type_ignores: [],
},
)

View File

@ -42,6 +42,14 @@ pub enum Tok {
/// Whether the string is triple quoted.
triple_quoted: bool,
},
/// Token value for a Jupyter magic command. These are filtered out of the token stream
/// prior to parsing when the mode is [`Mode::Jupyter`].
MagicCommand {
/// The magic command value.
value: String,
/// The kind of magic command.
kind: MagicKind,
},
/// Token value for a comment. These are filtered out of the token stream prior to parsing.
#[cfg(feature = "full-lexer")]
Comment(String),
@ -202,7 +210,7 @@ pub enum Tok {
impl Tok {
pub fn start_marker(mode: Mode) -> Self {
match mode {
Mode::Module => Tok::StartModule,
Mode::Module | Mode::Jupyter => Tok::StartModule,
Mode::Interactive => Tok::StartInteractive,
Mode::Expression => Tok::StartExpression,
}
@ -225,6 +233,7 @@ impl fmt::Display for Tok {
let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 });
write!(f, "{kind}{quotes}{value}{quotes}")
}
MagicCommand { kind, value } => write!(f, "{kind}{value}"),
Newline => f.write_str("Newline"),
#[cfg(feature = "full-lexer")]
NonLogicalNewline => f.write_str("NonLogicalNewline"),
@ -325,6 +334,93 @@ impl fmt::Display for Tok {
}
}
/// The kind of magic command as defined in [IPython Syntax] in the IPython codebase.
///
/// Each kind corresponds to a one- or two-character escape prefix. The prefix can
/// be turned into a kind with `TryFrom<char>` / `TryFrom<[char; 2]>`, and a kind
/// is rendered back to its prefix through `Display`.
///
/// [IPython Syntax]: https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L335-L343
#[derive(PartialEq, Eq, Debug, Clone, Hash, Copy)]
pub enum MagicKind {
    /// Send line to underlying system shell (`!`).
    Shell,
    /// Send line to system shell and capture output (`!!`).
    ShCap,
    /// Show help on object (`?`).
    Help,
    /// Show help on object, with extra verbosity (`??`).
    Help2,
    /// Call magic function (`%`).
    Magic,
    /// Call cell magic function (`%%`).
    Magic2,
    /// Call first argument with rest of line as arguments after splitting on whitespace
    /// and quote each as string (`,`).
    Quote,
    /// Call first argument with rest of line as an argument quoted as a single string (`;`).
    Quote2,
    /// Call first argument with rest of line as arguments (`/`).
    Paren,
}

/// Maps a single escape character to its kind; any other character is an error
/// carrying a human-readable message.
impl TryFrom<char> for MagicKind {
    type Error = String;

    fn try_from(ch: char) -> Result<Self, Self::Error> {
        let kind = match ch {
            '!' => Self::Shell,
            '?' => Self::Help,
            '%' => Self::Magic,
            ',' => Self::Quote,
            ';' => Self::Quote2,
            '/' => Self::Paren,
            _ => return Err(format!("Unexpected magic escape: {ch}")),
        };
        Ok(kind)
    }
}

/// Maps a doubled escape character (`!!`, `??`, `%%`) to its kind; any other pair
/// is an error carrying a human-readable message.
impl TryFrom<[char; 2]> for MagicKind {
    type Error = String;

    fn try_from(ch: [char; 2]) -> Result<Self, Self::Error> {
        let [c1, c2] = ch;
        match (c1, c2) {
            ('!', '!') => Ok(Self::ShCap),
            ('?', '?') => Ok(Self::Help2),
            ('%', '%') => Ok(Self::Magic2),
            _ => Err(format!("Unexpected magic escape: {c1}{c2}")),
        }
    }
}

/// Writes the escape prefix that introduces this kind of command.
impl fmt::Display for MagicKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let prefix = match self {
            Self::Shell => "!",
            Self::ShCap => "!!",
            Self::Help => "?",
            Self::Help2 => "??",
            Self::Magic => "%",
            Self::Magic2 => "%%",
            Self::Quote => ",",
            Self::Quote2 => ";",
            Self::Paren => "/",
        };
        f.write_str(prefix)
    }
}
impl MagicKind {
/// Returns the length of the magic command prefix.
pub fn prefix_len(self) -> TextSize {
let len = match self {
MagicKind::Shell
| MagicKind::Magic
| MagicKind::Help
| MagicKind::Quote
| MagicKind::Quote2
| MagicKind::Paren => 1,
MagicKind::ShCap | MagicKind::Magic2 | MagicKind::Help2 => 2,
};
len.into()
}
}
/// The kind of string literal as described in the [String and Bytes literals]
/// section of the Python reference.
///