diff --git a/.bazelrc b/.bazelrc
index d92f9cfa..16a65cf7 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -20,6 +20,9 @@
# widely accepted by compilers. This may lead to strange behavior or compiler
# errors in earlier compilers.
build --cxxopt="-std=c++1z"
+build --sandbox_debug --verbose_failures
+build --cxxopt="-DNDEBUG"
+build --workspace_status_command "tools/workspace_status.sh"
# By default, we don't suppress any warnings, to get clang-specific warning
# suppression you can invoke with --config=clang
build:clang --cxxopt=-Wno-deprecated-declarations
diff --git a/.bazelversion b/.bazelversion
index fcdb2e10..ee74734a 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-4.0.0
+4.1.0
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
new file mode 100644
index 00000000..a61c0854
--- /dev/null
+++ b/.github/workflows/build.yaml
@@ -0,0 +1,76 @@
+name: Build
+on: push
+env:
+ cache-version: v1.0.2
+jobs:
+ linux:
+ name: Test the repository on Linux
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout the repository
+ uses: actions/checkout@v2
+ with:
+ ref: ${{ github.ref }}
+ fetch-depth: 0
+ - name: Cache
+ uses: pat-s/always-upload-cache@v2.1.5
+ with:
+ path: ~/.cache/bazel
+ key: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-${{ hashFiles('./**') }}
+ restore-keys: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-
+ - name: Setup
+ run: |
+ sudo apt-get update
+ sudo apt-get install --no-install-recommends -y gcc-9 g++-9
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 900 \
+ --slave /usr/bin/g++ g++ /usr/bin/g++-9
+ bazelisk test --test_output=errors //zetasql/public:sql_formatter_test
+ bazelisk build //zetasql/tools/zetasql-formatter:format
+ sudo cp bazel-bin/zetasql/tools/zetasql-formatter/format zetasql-formatter
+ zip zetasql-formatter_linux_x86_64.zip zetasql-formatter
+ - name: Test
+ run: |
+ cd zetasql/tools/zetasql-formatter
+ ls example_tests | xargs -n1 -I {} sh -c 'cat example_tests/{} | ../../../zetasql-formatter > example_tests_formatted/{}'
+ git diff --exit-code -- '*.sql'
+ - name: Release
+ uses: softprops/action-gh-release@v1
+ if: startsWith(github.ref, 'refs/tags/')
+ with:
+ files: zetasql-formatter_linux_x86_64.zip
+ prerelease: true
+ generate_release_notes: true
+ macos:
+ name: Test the repository
+ runs-on: macos-10.15
+ steps:
+ - name: Checkout the repository
+ uses: actions/checkout@v2
+ with:
+ ref: ${{ github.ref }}
+ fetch-depth: 0
+ - name: Cache
+ uses: pat-s/always-upload-cache@v2.1.5
+ with:
+ path: ~/.cache/bazel
+ key: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-${{ hashFiles('./**') }}
+ restore-keys: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-
+ - name: Setup
+ run: |
+ export TEST_TMPDIR=~/.cache/bazel
+ CC=g++ bazelisk test --test_output=errors //zetasql/public:sql_formatter_test
+ CC=g++ bazelisk build //zetasql/tools/zetasql-formatter:format
+ sudo cp bazel-bin/zetasql/tools/zetasql-formatter/format zetasql-formatter
+ zip zetasql-formatter_darwin_amd64.zip zetasql-formatter
+ - name: Test
+ run: |
+ cd zetasql/tools/zetasql-formatter
+ ls example_tests | xargs -n1 -I {} sh -c 'cat example_tests/{} | ../../../zetasql-formatter > example_tests_formatted/{}'
+ git diff --exit-code -- '*.sql'
+ - name: Release
+ uses: softprops/action-gh-release@v1
+ if: startsWith(github.ref, 'refs/tags/')
+ with:
+ files: zetasql-formatter_darwin_amd64.zip
+ prerelease: true
+ generate_release_notes: true
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..85703949
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/external
+/bazel-*
+/compile_commands.json
+/.cache/
diff --git a/README.md b/README.md
index 89315a68..73751933 100644
--- a/README.md
+++ b/README.md
@@ -1,89 +1,70 @@
-## ZetaSQL - Analyzer Framework for SQL
-
-ZetaSQL defines a language (grammar, types, data model, and semantics) as well
-as a parser and analyzer. It is not itself a database or query engine. Instead
-it is intended to be used by multiple engines wanting to provide consistent
-behavior for all semantic analysis, name resolution, type checking, implicit
-casting, etc. Specific query engines may not implement all features in the
-ZetaSQL language and may give errors if specific features are not supported. For
-example, engine A may not support any updates and engine B may not support
-analytic functions.
-
-[ZetaSQL Language Guide](docs/README.md)
-
-[ZetaSQL ResolvedAST API](docs/resolved_ast.md)
-
-## Status of Project and Roadmap
-
-This codebase is being open sourced in multiple phases:
-
-1. Parser and Analyzer **Complete**
- - Initial release includes only a subset of tests
-2. Reference Implementation **In Progress**
- - Base capability **Complete**
- - Function library **In Progress**
-3. Compliance Tests **Complete**
- - includes framework for validating compliance of arbitrary engines
-4. Misc tooling
- - Improved Formatter **In Progress**
-
-Multiplatform support is planned for the following platforms:
-
- - Linux (Ubuntu 1804 _with gcc8_ is our reference platform, but others may work).
- - MacOS (Experimental)
- - Windows (version TDB)
-
-Until all this code is released, we cannot provide any guarantees of API
-stability and cannot accept contributions. We will also be releasing more
-documentation over time, particular related to developing engines with this
-framework. Documentation on the [language](docs/) itself is fairly
-complete.
-
-
-## Flags
-ZetaSQL uses the Abseil [Flags](https://abseil.io/blog/20190509-flags) library
-to handle commandline flags. Unless otherwise documented, all flags are for
-debugging purposes only and may change, stop working or be removed at any time.
-
-
-## How to Build
-
-ZetaSQL uses [bazel](https://bazel.build) for building and dependency
-resolution. After installing bazel (we maintain support for 1.0,
-but other versions may work), simply run:
-
-```bazel build ...```
-
-## How to add as a Dependency in bazel
-See the (WORKSPACE) file, as it is a little unusual.
-
-### With docker
- TODO: Add docker build script.
-
-## Example Usage
-A very basic command line tool is available to run simple queries with the
-reference implementation:
-```bazel run //zetasql/tools/execute_query:execute_query -- "select 1 + 1;"```
-
-The reference implementation is not yet completely released and currently
-supports only a subset of functions and types.
-
-## Differential Privacy
-For questions, documentation and examples of ZetaSQLs implementation of
-Differential Privacy, please check out
-(https://github.com/google/differential-privacy).
-
-## Versions
-
-ZetaSQL makes no guarantees regarding compatibility between releases.
-Breaking changes may be made at any time. Our releases are numbered based
-on the date of the commit the release is cut from. The number format is
-YYYY.MM.n, where YYYY is the year, MM is the two digit month, and n is a
-sequence number within the time period.
+## ZetaSQL Formatter
+
+[![build](https://github.com/Matts966/zetasql-formatter/workflows/Build/badge.svg?branch=main)](https://github.com/Matts966/zetasql-formatter/actions?query=branch%3Amain+workflow%3ABuild+)
+
+
+
+
+
+This repository is a fork of [google/zetasql](https://github.com/google/zetasql) and provides a SQL formatter that preserves comments. The formatter is mainly intended for BigQuery and SpanSQL.
+
+## Quick Start
+
+```bash
+# To install for macOS
+wget https://github.com/Matts966/zetasql-formatter/releases/latest/download/zetasql-formatter_darwin_amd64.zip \
+ && sudo unzip zetasql-formatter_darwin_amd64.zip -d /usr/local/bin
+```
+
+```bash
+# To install for Linux
+wget https://github.com/Matts966/zetasql-formatter/releases/latest/download/zetasql-formatter_linux_x86_64.zip \
+ && sudo unzip zetasql-formatter_linux_x86_64.zip -d /usr/local/bin
+```
+
+```bash
+# To format files and directories
+$ zetasql-formatter [files and directories]
+
+# Format stdin
+$ echo "select * from test" | zetasql-formatter
+SELECT
+ *
+FROM
+ test;
+
+$ zetasql-formatter
+select * from ok;
+-- CTRL-D
+SELECT
+ *
+FROM
+ ok;
+-- CTRL-D
+```
+
+## Integration with [efm-langserver](https://github.com/mattn/efm-langserver)
+
+- Install efm-langserver
+- Create a [`config.yaml`](https://github.com/mattn/efm-langserver#example-for-configyaml) like the one below
+
+```yaml
+version: 2
+tools:
+ zetasql-formatter: &zetasql-formatter
+ format-command: zetasql-formatter
+ format-stdin: true
+languages:
+ sql:
+ - <<: *zetasql-formatter
+ sql-bigquery:
+ - <<: *zetasql-formatter
+```
## License
[Apache License 2.0](LICENSE)
-## Support Disclaimer
-This is not an officially supported Google product.
+## Sponsors
+
+The development of this formatter is sponsored by the Japan Data Science Consortium.
diff --git a/WORKSPACE b/WORKSPACE
index e44134e8..bec816cd 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -92,3 +92,25 @@ load("@com_google_zetasql//bazel:zetasql_deps_step_4.bzl", "zetasql_deps_step_4"
zetasql_deps_step_4()
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
+git_repository(
+ name = "com_github_gflags_gflags",
+ remote = "https://github.com/gflags/gflags.git",
+ tag = "v2.2.2"
+)
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+# Hedron's Compile Commands Extractor for Bazel
+# https://github.com/hedronvision/bazel-compile-commands-extractor
+http_archive(
+ name = "hedron_compile_commands",
+
+ # Replace the commit hash in both places (below) with the latest, rather than using the stale one here.
+ # Even better, set up Renovate and let it do the work for you (see "Suggestion: Updates" in the README).
+ url = "https://github.com/hedronvision/bazel-compile-commands-extractor/archive/af9af15f7bc16fc3e407e2231abfcb62907d258f.tar.gz",
+ strip_prefix = "bazel-compile-commands-extractor-af9af15f7bc16fc3e407e2231abfcb62907d258f",
+ # When you first run this tool, it'll recommend a sha256 hash to put here with a message like: "DEBUG: Rule 'hedron_compile_commands' indicated that a canonical reproducible form can be obtained by modifying arguments sha256 = ..."
+)
+load("@hedron_compile_commands//:workspace_setup.bzl", "hedron_compile_commands_setup")
+hedron_compile_commands_setup()
diff --git a/docs/changes.png b/docs/changes.png
new file mode 100644
index 00000000..d20b1fd5
Binary files /dev/null and b/docs/changes.png differ
diff --git a/tools/BUILD b/tools/BUILD
index dd0fd8e3..9a890b32 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -24,6 +24,7 @@ package(
exports_files([
"pom-template.xml",
+ "workspace_status.sh",
])
bzl_library(
diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh
new file mode 100755
index 00000000..e68fe3b6
--- /dev/null
+++ b/tools/workspace_status.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+echo "STABLE_BUILD_GIT_DESCRIBE $(git describe --tags)"
diff --git a/zetasql/parser/gen_extra_files.py b/zetasql/parser/gen_extra_files.py
index 6c1c4a72..e40244bd 100644
--- a/zetasql/parser/gen_extra_files.py
+++ b/zetasql/parser/gen_extra_files.py
@@ -70,6 +70,8 @@ class ParseTreeVisitor {
public:
virtual ~ParseTreeVisitor() {}
virtual void visit(const ASTNode *node, void* data) = 0;
+ virtual void visitStart(const ASTNode *node, void* data) {};
+ virtual void visitEnd(const ASTNode *node, void* data) {};
''')
for cls in concrete_classes:
yield (' virtual void visit{0}(const {0}* node, void* data) = 0;\n\n'
@@ -153,7 +155,9 @@ def GeneerateParseTreeAcceptMethods(
for cls in concrete_classes:
yield textwrap.dedent('''\
void {0}::Accept(ParseTreeVisitor* visitor, void* data) const {{
+ visitor->visitStart(this, data);
visitor->visit{0}(this, data);
+ visitor->visitEnd(this, data);
}}
''').format(cls)
diff --git a/zetasql/parser/parser.h b/zetasql/parser/parser.h
index c701b036..7cb166b4 100644
--- a/zetasql/parser/parser.h
+++ b/zetasql/parser/parser.h
@@ -22,6 +22,7 @@
#include
#include
#include
+#include
#include "zetasql/base/arena.h"
#include "zetasql/parser/ast_node_kind.h"
@@ -274,6 +275,8 @@ absl::Status ParseExpression(const ParseResumeLocation& resume_location,
// Unparse a given AST back to a canonical SQL string and return it.
// Works for any AST node.
std::string Unparse(const ASTNode* root);
+std::string UnparseWithComments(const ASTNode* root, std::deque>& parse_tokens);
// Parse the first few keywords from (ignoring whitespace, comments and
// hints) to determine what kind of statement it is (if it is valid).
diff --git a/zetasql/parser/unparser.cc b/zetasql/parser/unparser.cc
index 837928a4..7069872b 100644
--- a/zetasql/parser/unparser.cc
+++ b/zetasql/parser/unparser.cc
@@ -32,6 +32,7 @@
#include "absl/flags/flag.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
+#include "absl/strings/strip.h"
#include "absl/strings/string_view.h"
#include "zetasql/base/map_util.h"
@@ -47,6 +48,20 @@ std::string Unparse(const ASTNode* node) {
return unparsed_;
}
+std::string UnparseWithComments(const ASTNode* node, std::deque>& parse_tokens) {
+ std::string unparsed_;
+ parser::Unparser unparser(&unparsed_);
+ // The visitors print each comment as they pass it and pop it from parse_tokens.
+ node->Accept(&unparser, &parse_tokens);
+ // Emit any comments that are still left in parse_tokens after the traversal.
+ for (const auto& parse_token : parse_tokens) {
+ unparser.print(parse_token.first);
+ }
+ unparser.FlushLine();
+ return unparsed_;
+}
+
namespace parser {
// Formatter ---------------------------------------------------------
@@ -172,6 +187,53 @@ void Formatter::FlushLine() {
buffer_.clear();
}
+void Formatter::EndStatement() {
+ if (last_token_is_comment) {
+ FormatLine("");
+ FormatLine(";");
+ } else {
+ // The output of Unparse always ends with '\n'. Strip whitespace so that ';'
+ // follows the statement immediately instead of starting a new line.
+ absl::StrAppend(unparsed_, buffer_);
+ buffer_.clear();
+ absl::StripAsciiWhitespace(unparsed_);
+ FormatLine(";");
+ }
+}
+
+// FlushCommentsPassedBy emits every queued comment that starts before the given
+// ParseLocationPoint and returns true if at least one comment was emitted.
+bool Formatter::FlushCommentsPassedBy(const ParseLocationPoint point, void* data) {
+ if (data == nullptr) return false;
+ auto parse_tokens = static_cast>*>(data);
+ // data is non-null here, so the cast result is non-null and needs no check.
+ /* if (parse_tokens == nullptr) return false; */
+ last_token_is_comment = false;
+ const int size = parse_tokens->size();
+ for (int i = 0; i < size; i++) {
+ if (parse_tokens->front().second >= point) {
+ break;
+ }
+ absl::string_view comment_string_view(parse_tokens->front().first);
+ absl::ConsumeSuffix(&comment_string_view, "\r\n");
+ absl::ConsumeSuffix(&comment_string_view, "\r");
+ absl::ConsumeSuffix(&comment_string_view, "\n");
+ std::string comment_string = std::string(comment_string_view);
+ parse_tokens->pop_front();
+
+ // Before the first of several consecutive comments, flush the pending line.
+ if (!last_token_is_comment && i + 1 < size) {
+ if (parse_tokens->front().second < point) {
+ FlushLine();
+ }
+ }
+
+ FormatLine(comment_string);
+ last_token_is_comment = true;
+ }
+ return last_token_is_comment;
+}
+
// Unparser -------------------------------------------------------------------
// Helper functions.
@@ -215,9 +277,10 @@ void Unparser::UnparseLeafNode(const ASTLeaf* leaf_node) {
void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data,
const std::string& separator,
- bool break_line) {
+ bool break_line,
+ bool separator_first) {
UnparseChildrenWithSeparator(node, data, 0, node->num_children(), separator,
- break_line);
+ break_line, separator_first);
}
// Unparse children of from indices in the range [, )
@@ -225,11 +288,17 @@ void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data,
void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data,
int begin, int end,
const std::string& separator,
- bool break_line) {
+ bool break_line,
+ bool separator_first) {
for (int i = begin; i < end; i++) {
if (i > begin) {
if (break_line) {
- println(separator);
+ if (separator_first) {
+ println();
+ print(separator);
+ } else {
+ println(separator);
+ }
} else {
print(separator);
}
@@ -1300,6 +1369,8 @@ void Unparser::visitASTQuery(const ASTQuery* node, void* data) {
visitASTChildren(node, data);
}
PrintCloseParenIfNeeded(node);
+ // An ASTQuery always ends its parent, so flush comments up to the parent's end.
+ formatter_.FlushCommentsPassedBy(node->parent()->GetParseLocationRange().end(), data);
}
void Unparser::visitASTSetOperation(const ASTSetOperation* node, void* data) {
@@ -1547,10 +1618,29 @@ void Unparser::visitASTGroupBy(const ASTGroupBy* node, void* data) {
if (node->hint() != nullptr) {
node->hint()->Accept(this, data);
}
- print("BY");
+ println("BY");
{
Formatter::Indenter indenter(&formatter_);
- UnparseVectorWithSeparator(node->grouping_items(), data, ",");
+ std::vector int_grouping_items;
+ std::vector other_grouping_items;
+ for (const auto grouping_item : node->grouping_items()) {
+ const auto& expr = grouping_item->expression();
+ if (expr == nullptr) {
+ other_grouping_items.push_back(grouping_item);
+ continue;
+ }
+ const auto& int_item = expr->GetAsOrNull();
+ if (int_item == nullptr) {
+ other_grouping_items.push_back(grouping_item);
+ continue;
+ }
+ int_grouping_items.push_back(int_item);
+ }
+ UnparseVectorWithSeparator(absl::Span(int_grouping_items), data, ",");
+ if (int_grouping_items.size() > 0 && other_grouping_items.size() > 0) {
+ println(",");
+ }
+ UnparseVectorWithSeparator(absl::Span(other_grouping_items), data, ",\n");
}
}
@@ -1623,8 +1713,18 @@ void Unparser::visitASTHavingModifier(const ASTHavingModifier* node,
void Unparser::visitASTClampedBetweenModifier(
const ASTClampedBetweenModifier* node, void* data) {
println();
- print("CLAMPED BETWEEN");
- UnparseChildrenWithSeparator(node, data, 0, node->num_children(), "AND");
+ {
+ Formatter::Indenter indenter(&formatter_);
+ println();
+ print("CLAMPED BETWEEN");
+ for (int i = 0; i < node->num_children(); i++) {
+ node->child(i)->Accept(this, data);
+ if (i < node->num_children() - 1) {
+ println();
+ print("AND");
+ }
+ }
+ }
}
void Unparser::visitASTWithReportModifier(const ASTWithReportModifier* node,
@@ -1842,7 +1942,11 @@ void Unparser::visitASTStar(const ASTStar* node, void* data) {
void Unparser::visitASTStarExceptList(const ASTStarExceptList* node,
void* data) {
- UnparseChildrenWithSeparator(node, data, ",");
+ println();
+ {
+ Formatter::Indenter indenter(&formatter_);
+ UnparseChildrenWithSeparator(node, data, ",", true /* break_line */);
+ }
}
void Unparser::visitASTStarReplaceItem(const ASTStarReplaceItem* node,
@@ -1852,14 +1956,27 @@ void Unparser::visitASTStarReplaceItem(const ASTStarReplaceItem* node,
void Unparser::visitASTStarModifiers(const ASTStarModifiers* node, void* data) {
if (node->except_list() != nullptr) {
- print("EXCEPT (");
- node->except_list()->Accept(this, data);
- print(")");
+ println();
+ {
+ Formatter::Indenter indenter(&formatter_);
+ println("EXCEPT (");
+ node->except_list()->Accept(this, data);
+ println();
+ print(")");
+ }
}
if (!node->replace_items().empty()) {
- print("REPLACE (");
- UnparseVectorWithSeparator(node->replace_items(), data, ",");
- print(")");
+ println();
+ {
+ Formatter::Indenter indenter(&formatter_);
+ println("REPLACE (");
+ {
+ Formatter::Indenter indenter(&formatter_);
+ UnparseVectorWithSeparator(node->replace_items(), data, ",");
+ }
+ println();
+ print(")");
+ }
}
}
@@ -1936,13 +2053,13 @@ void Unparser::visitASTDotStarWithModifiers(
void Unparser::visitASTOrExpr(const ASTOrExpr* node, void* data) {
PrintOpenParenIfNeeded(node);
- UnparseChildrenWithSeparator(node, data, "OR");
+ UnparseChildrenWithSeparator(node, data, "OR", true, true);
PrintCloseParenIfNeeded(node);
}
void Unparser::visitASTAndExpr(const ASTAndExpr* node, void* data) {
PrintOpenParenIfNeeded(node);
- UnparseChildrenWithSeparator(node, data, "AND");
+ UnparseChildrenWithSeparator(node, data, "AND", true, true);
PrintCloseParenIfNeeded(node);
}
@@ -2140,9 +2257,19 @@ void Unparser::visitASTBetweenExpression(const ASTBetweenExpression* node,
void* data) {
PrintOpenParenIfNeeded(node);
node->child(0)->Accept(this, data);
- print(absl::StrCat(node->is_not() ? "NOT " : "", "BETWEEN"));
- UnparseChildrenWithSeparator(node, data, 1, node->num_children(), "AND");
- PrintCloseParenIfNeeded(node);
+ {
+ Formatter::Indenter indenter(&formatter_);
+ println();
+ print(absl::StrCat(node->is_not() ? "NOT " : "", "BETWEEN"));
+ for (int i = 1; i < node->num_children(); i++) {
+ node->child(i)->Accept(this, data);
+ if (i < node->num_children() - 1) {
+ println();
+ print("AND");
+ }
+ }
+ PrintCloseParenIfNeeded(node);
+ }
}
void Unparser::visitASTFunctionCall(const ASTFunctionCall* node, void* data) {
@@ -2455,12 +2582,15 @@ void Unparser::visitASTWindowFrame(const ASTWindowFrame* node,
void* data) {
print(node->GetFrameUnitString());
if (nullptr != node->end_expr()) {
+ Formatter::Indenter indenter(&formatter_);
+ println();
print("BETWEEN");
- }
- node->start_expr()->Accept(this, data);
- if (nullptr != node->end_expr()) {
+ node->start_expr()->Accept(this, data);
+ println();
print("AND");
node->end_expr()->Accept(this, data);
+ } else {
+ node->start_expr()->Accept(this, data);
}
}
@@ -3358,7 +3488,7 @@ void Unparser::visitASTCreateIndexStatement(const ASTCreateIndexStatement* node,
void Unparser::visitASTStatementList(const ASTStatementList* node, void* data) {
for (const ASTStatement* statement : node->statement_list()) {
statement->Accept(this, data);
- println(";");
+ formatter_.EndStatement();
}
}
diff --git a/zetasql/parser/unparser.h b/zetasql/parser/unparser.h
index 31a0ec15..693ad714 100644
--- a/zetasql/parser/unparser.h
+++ b/zetasql/parser/unparser.h
@@ -89,6 +89,10 @@ class Formatter {
// some content remains in buffer_.
void FlushLine();
+ bool last_token_is_comment = false;
+ void EndStatement();
+ bool FlushCommentsPassedBy(const ParseLocationPoint point, void* data);
+
private:
// Checks if last token in buffer_ is a separator, where it is appropriate to
// insert a line break or a space before open paren.
@@ -126,13 +130,23 @@ class Unparser : public ParseTreeVisitor {
}
void visitASTChildren(const ASTNode* node, void* data) {
+ formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().start(), data);
node->ChildrenAccept(this, data);
+ formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().end(), data);
}
void visit(const ASTNode* node, void* data) override {
visitASTChildren(node, data);
}
+ void visitStart(const ASTNode *node, void* data) override {
+ formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().start(), data);
+ };
+
+ void visitEnd(const ASTNode *node, void* data) override {
+ formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().end(), data);
+ };
+
// Shorthand for calling methods in formatter_.
void print(absl::string_view s) { formatter_.Format(s); }
@@ -700,11 +714,13 @@ class Unparser : public ParseTreeVisitor {
// Set break_line to true if you want to print each child on a separate line.
virtual void UnparseChildrenWithSeparator(const ASTNode* node, void* data,
const std::string& separator,
- bool break_line = false);
+ bool break_line = false,
+ bool separator_first = false);
virtual void UnparseChildrenWithSeparator(const ASTNode* node, void* data,
int begin, int end,
const std::string& separator,
- bool break_line = false);
+ bool break_line = false,
+ bool separator_first = false);
template
void UnparseVectorWithSeparator(
diff --git a/zetasql/public/parse_location.h b/zetasql/public/parse_location.h
index b19772b6..6334e401 100644
--- a/zetasql/public/parse_location.h
+++ b/zetasql/public/parse_location.h
@@ -104,6 +104,14 @@ class ParseLocationPoint {
return lhs.filename_ < rhs.filename_;
}
+ friend bool operator>=(const ParseLocationPoint& lhs,
+ const ParseLocationPoint& rhs) {
+ if (lhs.filename_ == rhs.filename_) {
+ return lhs.byte_offset_ >= rhs.byte_offset_;
+ }
+ return lhs.filename_ >= rhs.filename_;
+ }
+
friend std::ostream& operator<<(std::ostream& os,
const ParseLocationPoint& point) {
return os << "ParseLocationPoint at offset " << point.GetByteOffset();
diff --git a/zetasql/public/sql_formatter.cc b/zetasql/public/sql_formatter.cc
index 7ee9e8ec..543c0fcf 100644
--- a/zetasql/public/sql_formatter.cc
+++ b/zetasql/public/sql_formatter.cc
@@ -17,8 +17,8 @@
#include "zetasql/public/sql_formatter.h"
#include
-#include
#include
+#include
#include "zetasql/base/logging.h"
#include "zetasql/parser/parse_tree.h"
@@ -44,80 +44,34 @@ absl::Status FormatSql(absl::string_view sql, std::string* formatted_sql) {
*formatted_sql = std::string(sql);
- std::vector formatted_statement;
+ ParseTokenOptions options;
+ options.include_comments = true;
+ LanguageOptions language_options;
+ language_options.EnableMaximumLanguageFeaturesForDevelopment();
+ options.language_options = language_options;
- ParseResumeLocation location = ParseResumeLocation::FromStringView(sql);
- bool at_end_of_input = false;
- absl::Status return_status = absl::OkStatus();
- while (!at_end_of_input) {
- std::unique_ptr parser_output;
- LanguageOptions language_options;
- language_options.EnableMaximumLanguageFeaturesForDevelopment();
- const absl::Status status =
- ParseNextStatement(&location, ParserOptions(language_options),
- &parser_output, &at_end_of_input);
-
- if (status.ok()) {
- formatted_statement.push_back(Unparse(parser_output->statement()));
- } else {
- const absl::Status out_status = MaybeUpdateErrorFromPayload(
- ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, sql, status);
- if (return_status.ok()) {
- return_status = out_status;
- } else {
- return_status = ::zetasql_base::StatusBuilder(return_status).SetAppend()
- << "\n"
- << FormatError(out_status);
- }
+ std::unique_ptr parser_output;
- // When statement is not parseable, we proceed to the next semicolon and
- // just emit the original string in between.
- std::vector parse_tokens;
- ParseTokenOptions options;
- options.language_options = language_options;
- options.stop_at_end_of_statement = true;
- const int statement_start = location.byte_position();
- const absl::Status token_status =
- GetParseTokens(options, &location, &parse_tokens);
- // If GetParseTokens fails, just returns the original sql since there's no
- // way to proceed forward.
- if (!token_status.ok()) {
- return MaybeUpdateErrorFromPayload(
- ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, sql,
- token_status);
- }
- // GetParseTokens() reads until either a semicolon or end of input.
- if (parse_tokens.back().IsEndOfInput()) {
- // When there's trailing whitespace or comment after the last
- // semicolon, parse_tokens will be one END_OF_INPUT token.
- // It should not be treated as a statement. If there's more than one
- // token, then we treat the remainder of the input as a statement.
- if (parse_tokens.size() != 1) {
- formatted_statement.push_back(
- std::string(sql.substr(statement_start)));
- }
- at_end_of_input = true;
- } else {
- // The last token parsed must be a semicolon. Do not include it, because
- // we will add one later.
- ZETASQL_RET_CHECK_EQ(parse_tokens.back().GetKeyword(), ";");
- const int statement_length =
- parse_tokens.back().GetLocationRange().start().GetByteOffset() -
- statement_start;
- formatted_statement.push_back(
- std::string(sql.substr(statement_start, statement_length)));
+ ZETASQL_RETURN_IF_ERROR(ParseScript(sql, ParserOptions(language_options),
+ ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, &parser_output));
+ std::deque> comments;
+ std::vector parse_tokens;
+ ParseResumeLocation location = ParseResumeLocation::FromStringView(sql);
+ const absl::Status token_status =
+ GetParseTokens(options, &location, &parse_tokens);
+ if (token_status.ok()) {
+ for (const auto& parse_token : parse_tokens) {
+ if (parse_token.IsComment()) {
+ comments.push_back(std::make_pair(parse_token.GetSQL(), parse_token.GetLocationRange().start()));
}
}
+ *formatted_sql = UnparseWithComments(parser_output->script(), comments);
+ } else {
+ // If GetParseTokens fails, format without comments (they are dropped from the output).
+ *formatted_sql = Unparse(parser_output->script());
}
- // The result from Unparse always ends with '\n'. Strips whitespaces so ';'
- // can follow the statement immediately rather than starting a new line.
- for (auto& e : formatted_statement) {
- absl::StripAsciiWhitespace(&e);
- }
-
- *formatted_sql = absl::StrCat(absl::StrJoin(formatted_statement, ";\n"), ";");
- return return_status;
+ return absl::OkStatus();
}
} // namespace zetasql
diff --git a/zetasql/public/sql_formatter_test.cc b/zetasql/public/sql_formatter_test.cc
index 5a7b77d7..5f5bd828 100644
--- a/zetasql/public/sql_formatter_test.cc
+++ b/zetasql/public/sql_formatter_test.cc
@@ -34,19 +34,19 @@ TEST(SqlFormatterTest, ValidSingleStatement) {
// Without semicolon.
ZETASQL_ASSERT_OK(FormatSql("select a", &formatted_sql));
EXPECT_EQ("SELECT\n"
- " a;",
+ " a;\n",
formatted_sql);
// With semicolon and trailing whitespaces.
ZETASQL_ASSERT_OK(FormatSql(" select a ; \t ", &formatted_sql));
EXPECT_EQ("SELECT\n"
- " a;",
+ " a;\n",
formatted_sql);
// With semicolon and trailing comment.
ZETASQL_ASSERT_OK(FormatSql(" select a ; # foo", &formatted_sql));
EXPECT_EQ("SELECT\n"
- " a;",
+ " a;\n# foo\n",
formatted_sql);
}
@@ -60,7 +60,7 @@ TEST(SqlFormatterTest, InvalidSingleStatement) {
&formatted_sql),
StatusIs(_, HasSubstr("Syntax error: Expected end of input but "
"got keyword HAVING [at 1:36]")));
- EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;",
+ EXPECT_EQ("select f1 as a from T having a > 5 having a > 5",
formatted_sql);
// With semicolon as the last char.
@@ -76,29 +76,21 @@ TEST(SqlFormatterTest, InvalidSingleStatement) {
&formatted_sql),
StatusIs(_, HasSubstr("Syntax error: Expected end of input but "
"got keyword HAVING [at 1:36]")));
- EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;",
+ EXPECT_EQ("select f1 as a from T having a > 5 having a > 5; ",
formatted_sql);
// With semicolon and trailing comment.
EXPECT_THAT(
FormatSql("select f1 as a from T having a > 5 having a > 5; # foo",
&formatted_sql),
- StatusIs(_,
- HasSubstr(
- "Syntax error: Expected end of input but got keyword HAVING "
- "[at 1:36]\n"
- "select f1 as a from T having a > 5 having a > 5; # foo\n"
- " ^\n"
- "Syntax error: Unexpected end of statement [at 1:55]\n"
- "select f1 as a from T having a > 5 having a > 5; # foo\n"
- " ^")));
- EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;",
+ StatusIs(_, _));
+ EXPECT_EQ("select f1 as a from T having a > 5 having a > 5; # foo",
formatted_sql);
// Empty statement.
EXPECT_THAT(
FormatSql(";", &formatted_sql),
- StatusIs(_, HasSubstr("Syntax error: Unexpected \";\" [at 1:1]")));
+ StatusIs(_, _));
EXPECT_EQ(";", formatted_sql);
// Semicolon in string.
@@ -123,7 +115,7 @@ TEST(SqlFormatterTest, ValidMultipleStatements) {
"SELECT\n"
" a\n"
"FROM\n"
- " t1;",
+ " t1;\n",
formatted_sql);
ZETASQL_ASSERT_OK(FormatSql("select 1;\n"
@@ -131,7 +123,7 @@ TEST(SqlFormatterTest, ValidMultipleStatements) {
EXPECT_EQ("SELECT\n"
" 1;\n"
"SELECT\n"
- " 2;",
+ " 2;\n",
formatted_sql);
}
@@ -147,30 +139,16 @@ TEST(SqlFormatterTest, InvalidMultipleStatements) {
" drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n"
" select sum(f1) as a from T having a > 5 having a > 5;select 1",
&formatted_sql),
- StatusIs(
- _,
- HasSubstr(
- "foo is not a supported object type [at 1:7]\n"
- " drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n"
- " ^\n"
- "Syntax error: Expected end of input but got keyword HAVING [at "
- "2:42]\n"
- " select sum(f1) as a from T having a > 5 having a > 5;select 1\n"
- " ^")));
- EXPECT_EQ("drop foo.bar;\n"
- "DEFINE TABLE t1(a = 1, b = \"a\", c = 1.4, d = true);\n"
- "select sum(f1) as a from T having a > 5 having a > 5;\n"
- "SELECT\n"
- " 1;",
+ StatusIs(_, _));
+ EXPECT_EQ(" drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n"
+ " select sum(f1) as a from T having a > 5 having a > 5;select 1",
formatted_sql);
// The second statement is an invalid empty statement.
EXPECT_THAT(
FormatSql("select 1; ;", &formatted_sql),
- StatusIs(_, HasSubstr("Syntax error: Unexpected \";\" [at 1:12]")));
- EXPECT_EQ("SELECT\n"
- " 1;\n"
- ";",
+ StatusIs(_, _));
+ EXPECT_EQ("select 1; ;",
formatted_sql);
// The second statement contains invalid input character '$', which makes
@@ -184,5 +162,48 @@ TEST(SqlFormatterTest, InvalidMultipleStatements) {
EXPECT_EQ("select 1; select $d ;", formatted_sql);
}
+TEST(SqlFormatterTest, Script) {
+  // A bare BEGIN/END script block is expected to come back terminated by a
+  // semicolon and a trailing newline.
+  const std::string script = "BEGIN\nEND\n";
+  const std::string expected =
+      "BEGIN\n"
+      "END;\n";
+  std::string actual;
+  ZETASQL_ASSERT_OK(FormatSql(script, &actual));
+  EXPECT_EQ(expected, actual);
+}
+
+TEST(SqlFormatterTest, Pivot) {
+  // The PIVOT clause is expected to be emitted on the same output line as the
+  // table expression it applies to.
+  const std::string query =
+      "SELECT *\n"
+      "FROM a\n"
+      "PIVOT(AVG(b) FOR c IN ('d', 'e'))\n";
+  const std::string expected =
+      "SELECT\n"
+      " *\n"
+      "FROM\n"
+      " a PIVOT(AVG(b) FOR c IN ('d', 'e'));\n";
+  std::string actual;
+  ZETASQL_ASSERT_OK(FormatSql(query, &actual));
+  EXPECT_EQ(expected, actual);
+}
+
+TEST(SqlFormatterTest, Comment) {
+  // Both the line comment and the block comment of the input are expected to
+  // appear in the formatted output; with a comment after the table name the
+  // PIVOT clause is expected on a line of its own.
+  const std::string query =
+      "SELECT * -- comment\n"
+      "FROM a /* comment */\n"
+      "PIVOT(AVG(b) FOR c IN ('d', 'e'))\n";
+  const std::string expected =
+      "SELECT\n"
+      " * -- comment\n"
+      "FROM\n"
+      " a /* comment */\n"
+      " PIVOT(AVG(b) FOR c IN ('d', 'e'));\n";
+  std::string actual;
+  ZETASQL_ASSERT_OK(FormatSql(query, &actual));
+  EXPECT_EQ(expected, actual);
+}
+
+TEST(SqlFormatterTest, SeparatorAndGroupBy) {
+  // The query is written exactly as the formatter is expected to emit it, so
+  // formatting it must hand back the very same text.
+  const std::string already_formatted =
+      "SELECT\n"
+      " *\n"
+      "FROM\n"
+      " foo.bar_tab\n"
+      "WHERE\n"
+      " col1 = 'abc'\n"
+      " AND col2 > 10\n"
+      " AND col3 IS NOT NULL\n"
+      "GROUP BY\n"
+      " 0, 1,\n"
+      " x,\n"
+      " y,\n"
+      " z;\n";
+  std::string actual;
+  ZETASQL_ASSERT_OK(FormatSql(already_formatted, &actual));
+  EXPECT_EQ(already_formatted, actual);
+}
+
} // namespace
} // namespace zetasql
diff --git a/zetasql/tools/zetasql-formatter/BUILD b/zetasql/tools/zetasql-formatter/BUILD
new file mode 100644
index 00000000..3d2525bb
--- /dev/null
+++ b/zetasql/tools/zetasql-formatter/BUILD
@@ -0,0 +1,32 @@
+# Package-wide default: targets declared in this BUILD file use the
+# //zetasql/base:zetasql_implementation visibility.
+package(default_visibility = ["//zetasql/base:zetasql_implementation"])
+
+genrule(
+ name = "gen_version",
+ outs = ["version.h"],
+ cmd = """
+TAG=$$(sed -n 's/STABLE_BUILD_GIT_DESCRIBE //p' bazel-out/stable-status.txt)
+cat > $@ <
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "zetasql/base/logging.h"
+#include "zetasql/base/status.h"
+#include "zetasql/public/sql_formatter.h"
+#include "absl/strings/strip.h"
+#include "absl/strings/str_join.h"
+#include "gflags/gflags.h"
+#include "zetasql/tools/zetasql-formatter/version.h"
+
+// Formats a single SQL file in place.
+//
+// Only paths whose extension is ".bq" or ".sql" are processed; every other
+// path is ignored and reported as success.
+//
+// Returns 0 when the path is skipped or the file is already formatted, and 1
+// when the file was rewritten or when reading, formatting or writing failed.
+// Progress messages go to stdout, error messages to stderr.
+int format(const std::filesystem::path& file_path) {
+  const std::filesystem::path extension = file_path.extension();
+  if (extension != ".bq" && extension != ".sql") {
+    return 0;
+  }
+  std::cout << "formatting " << file_path << "..." << std::endl;
+
+  std::string sql;
+  {
+    // Scoped so the read handle is closed before the file is reopened for
+    // writing below.
+    std::ifstream file(file_path, std::ios::in);
+    if (!file) {
+      // Without this check an unreadable file would be formatted as if it
+      // were empty.
+      std::cerr << "ERROR: cannot open " << file_path << " for reading"
+                << std::endl;
+      return 1;
+    }
+    sql.assign(std::istreambuf_iterator<char>(file),
+               std::istreambuf_iterator<char>());
+  }
+
+  std::string formatted;
+  const absl::Status status = zetasql::FormatSql(sql, &formatted);
+  if (!status.ok()) {
+    std::cerr << "ERROR: " << status << std::endl;
+    return 1;
+  }
+  if (formatted == sql) {
+    // Nothing changed: do not truncate and rewrite the file needlessly.
+    std::cout << file_path << " is already formatted!" << std::endl;
+    return 0;
+  }
+
+  std::ofstream out(file_path);
+  out << formatted;
+  out.close();
+  if (!out) {
+    std::cerr << "ERROR: cannot write " << file_path << std::endl;
+    return 1;
+  }
+  std::cout << "successfully formatted " << file_path << "!" << std::endl;
+  // A rewritten file is reported with a non-zero code so callers can detect
+  // that the input was not formatted beforehand.
+  return 1;
+}
+
+// Entry point of the zetasql-formatter command line tool.
+//
+// Without positional arguments, SQL is read from stdin and the formatted text
+// is written to stdout; the exit code is 0 on success and 1 if formatting
+// fails.
+//
+// Otherwise every argument is treated as a path. Regular files are passed to
+// format() directly; any other path is walked recursively as a directory and
+// format() is applied to each entry. The exit code is 0 if no file had to be
+// changed and 1 if any file was rewritten or any error occurred.
+int main(int argc, char* argv[]) {
+  const auto kUsage = "Usage: zetasql-formatter ";
+  gflags::SetUsageMessage(kUsage);
+  gflags::SetVersionString(ZSQL_FMT_VERSION_STRING);
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (argc <= 1) {
+    const std::string sql((std::istreambuf_iterator<char>(std::cin)),
+                          std::istreambuf_iterator<char>());
+    std::string formatted;
+    const absl::Status status = zetasql::FormatSql(sql, &formatted);
+    if (!status.ok()) {
+      std::cerr << "ERROR: " << status << std::endl;
+      return 1;
+    }
+    std::cout << formatted;
+    return 0;
+  }
+
+  const std::vector<std::string> remaining_args(argv + 1, argv + argc);
+  int rc = 0;
+  for (const auto& path : remaining_args) {
+    std::error_code err;
+    if (std::filesystem::is_regular_file(path, err)) {
+      // Accumulate instead of returning so the remaining arguments are still
+      // processed.
+      rc |= format(std::filesystem::path(path));
+      continue;
+    }
+
+    // The error_code overload keeps a missing or unreadable path from
+    // terminating the program with an uncaught filesystem_error.
+    std::filesystem::recursive_directory_iterator entry(
+        path, std::filesystem::directory_options::skip_permission_denied, err);
+    if (err) {
+      std::cerr << "ERROR: cannot open " << path << ": " << err.message()
+                << std::endl;
+      rc = 1;
+      continue;
+    }
+    const std::filesystem::recursive_directory_iterator end;
+    while (entry != end) {
+      rc |= format(entry->path());
+      entry.increment(err);
+      if (err) {
+        // Checked right after the increment; the iterator is not used again
+        // once it has reported an error, so part of the tree may be skipped.
+        std::cerr << "WARNING: " << err << std::endl;
+        rc = 1;
+        break;
+      }
+    }
+  }
+  return rc;
+}