Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/dialect/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ impl Dialect for ClickHouseDialect {
true
}

/// See <https://clickhouse.com/docs/sql-reference/functions/tuple-functions#tupleelement>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this behavior differ from the `parse_compound_expr` handling? If so, how? At first glance the two look syntactically identical.

/// ClickHouse opts in to positional tuple element access, overriding the
/// trait default of `false`.
fn supports_tuple_element_access(&self) -> bool {
true
}

// ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting
// with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected.
//
Expand Down
12 changes: 12 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,18 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect supports ClickHouse-style positional tuple
/// element access such as `tup.1`, `arr[1].2`, or `(1, 2, 3).3`.
///
/// When enabled, the tokenizer emits a standalone `.` (instead of fusing it
/// into a decimal literal) when the previous token can be the left-hand
/// side of a tuple access: an identifier, `]`, `)`, or another integer
/// already inside the chain. An integer that directly follows such a `.`
/// likewise does not absorb a trailing `.` as its decimal point, so nested
/// access like `t.1.2` stays split into separate tokens.
///
/// The parser also consults this flag: a number following `.` ends its
/// wildcard detection instead of raising an "expected identifier" error.
///
/// Defaults to `false`.
///
/// See <https://clickhouse.com/docs/sql-reference/functions/tuple-functions#tupleelement>
fn supports_tuple_element_access(&self) -> bool {
false
}

/// Returns true if the dialect supports numbers containing underscores, e.g. `10_000_000`
fn supports_numeric_literal_underscores(&self) -> bool {
false
Expand Down
7 changes: 7 additions & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1337,6 +1337,13 @@ impl<'a> Parser<'a> {
AttachedToken(next_token),
));
}
Token::Number(_, _) if self.dialect.supports_tuple_element_access() => {
// ClickHouse-style positional tuple access (`t.1`,
// `t.1.2`). Exit the wildcard-detection loop and let
// `parse_expr` handle the expression via the
// index-rewind fall-through below.
break;
}
_ => {
return self.expected("an identifier or a '*' after '.'", next_token);
}
Expand Down
128 changes: 124 additions & 4 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1381,10 +1381,18 @@ impl<'a> Tokenizer<'a> {
return Ok(Some(Token::HexStringLiteral(s2)));
}

// match one period
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
// match one period. if we've just consumed an integer
// and the previous token is `.`, we're inside a ClickHouse
// tuple element access chain, and the trailing dot belongs
// to the chain, not to this number.
let in_tuple_chain = self.dialect.supports_tuple_element_access()
&& prev_token == Some(&Token::Period)
&& !s.is_empty();
if !in_tuple_chain {
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
}
}

// If the dialect supports identifiers that start with a numeric prefix
Expand All @@ -1398,6 +1406,26 @@ impl<'a> Tokenizer<'a> {
}
}

// ClickHouse-style positional tuple element access: emit `.` as a
// standalone Period when it follows the LHS of a chain (an
// identifier, `]`, `)`, or another integer already in the chain),
// so e.g. `arr[1].1` and `t.1.2` parse as `CompoundFieldAccess`
// instead of being fused into a decimal literal.
if s == "."
&& self.dialect.supports_tuple_element_access()
&& matches!(
prev_token,
Some(
Token::Word(_)
| Token::RBracket
| Token::RParen
| Token::Number(_, _)
)
)
{
return Ok(Some(Token::Period));
}

// Consume fractional digits.
s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
Expand Down Expand Up @@ -4303,6 +4331,98 @@ mod tests {
);
}

#[test]
fn tokenize_clickhouse_tuple_element_access() {
// Run the positive cases against every dialect that opts in to positional
// tuple element access.
let dialects = all_dialects_where(|dialect| dialect.supports_tuple_element_access());

// After a Word, RBracket, or RParen, `.<digit>` is split into a `Period`
// token followed by a separate integer `Number` token, rather than being
// fused into a single decimal literal such as `.1`.
dialects.tokenizes_to(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure I follow the patch/tests - are we saying that without this PR, a sql like t.1 tokenizes to something other than what's being asserted (if so what does the previous behavior tokenized to)?

"t.1",
vec![
Token::make_word("t", None),
Token::Period,
Token::Number("1".to_string(), false),
],
);

// Access on a subscript result: the `.` follows `]`.
dialects.tokenizes_to(
"arr[1].2",
vec![
Token::make_word("arr", None),
Token::LBracket,
Token::Number("1".to_string(), false),
Token::RBracket,
Token::Period,
Token::Number("2".to_string(), false),
],
);

// Access on a parenthesized tuple literal: the `.` follows `)`.
dialects.tokenizes_to(
"(1,2).2",
vec![
Token::LParen,
Token::Number("1".to_string(), false),
Token::Comma,
Token::Number("2".to_string(), false),
Token::RParen,
Token::Period,
Token::Number("2".to_string(), false),
],
);

// Nested access `tup.1.2` (Tuple of Tuple) — the rule must re-fire on
// the second dot, and the integer between the two dots must not eat
// the trailing dot as a decimal fraction.
dialects.tokenizes_to(
"t.1.2",
vec![
Token::make_word("t", None),
Token::Period,
Token::Number("1".to_string(), false),
Token::Period,
Token::Number("2".to_string(), false),
],
);

// Decimal literals must remain untouched: in both cases below the number
// is preceded by whitespace, which is never the left-hand side of an
// access chain, so the dot stays part of the numeric literal.
dialects.tokenizes_to(
"SELECT 0.5",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("0.5".to_string(), false),
],
);

// A leading-dot decimal after whitespace is still a single Number token.
dialects.tokenizes_to(
"SELECT .5",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(".5".to_string(), false),
],
);

// Regression: dialects without the flag keep the old behavior. With
// `GenericDialect`, the dot and digit after `]` fuse into a single
// decimal-shaped Number token (`.2`).
let tokens = Tokenizer::new(&GenericDialect {}, "arr[1].2")
.tokenize()
.unwrap();
assert_eq!(
tokens,
vec![
Token::make_word("arr", None),
Token::LBracket,
Token::Number("1".to_string(), false),
Token::RBracket,
Token::Number(".2".to_string(), false),
]
);
}

#[test]
fn tokenize_period_underscore() {
let sql = String::from("SELECT table._col");
Expand Down
23 changes: 23 additions & 0 deletions tests/sqlparser_clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,29 @@ fn parse_map_access_expr() {
);
}

#[test]
fn parse_tuple_element_access() {
    // Builds the unsigned numeric literal used for both the subscript index
    // and the positional tuple element.
    let number = |digits: &str| Expr::value(Value::Number(digits.parse().unwrap(), false));

    // One level of positional access applied to an element of an array of
    // tuples: the subscript comes first in the access chain, then the dot.
    let query = clickhouse().verified_only_select("SELECT arr[1].1 FROM t");
    let expected = Expr::CompoundFieldAccess {
        root: Box::new(Expr::Identifier(Ident::new("arr"))),
        access_chain: vec![
            AccessExpr::Subscript(Subscript::Index { index: number("1") }),
            AccessExpr::Dot(number("1")),
        ],
    };
    assert_eq!(&expected, expr_from_projection(only(&query.projection)));

    // The remaining shapes (plain identifier, tuple literal, nested chain)
    // are only passed through `verified_stmt`; their AST is not inspected.
    for sql in [
        "SELECT t.1 FROM x",
        "SELECT (1, 2, 3).2",
        "SELECT arr[1].1.2 FROM t",
    ] {
        clickhouse().verified_stmt(sql);
    }
}

#[test]
fn parse_array_expr() {
let sql = "SELECT ['1', '2'] FROM test";
Expand Down