diff --git a/.bazelrc b/.bazelrc index d92f9cfa..16a65cf7 100644 --- a/.bazelrc +++ b/.bazelrc @@ -20,6 +20,9 @@ # widely accepted by compilers. This may lead to strange behavior or compiler # errors in earlier compilers. build --cxxopt="-std=c++1z" +build --sandbox_debug --verbose_failures +build --cxxopt="-DNDEBUG" +build --workspace_status_command "tools/workspace_status.sh" # By default, we don't suppress any warnings, to get clang-specific warning # suppression you can invoke with --config=clang build:clang --cxxopt=-Wno-deprecated-declarations diff --git a/.bazelversion b/.bazelversion index fcdb2e10..ee74734a 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -4.0.0 +4.1.0 diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..a61c0854 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,76 @@ +name: Build +on: push +env: + cache-version: v1.0.2 +jobs: + linux: + name: Test the repository on Linux + runs-on: ubuntu-latest + steps: + - name: Checkout the repository + uses: actions/checkout@v2 + with: + ref: ${{ github.ref }} + fetch-depth: 0 + - name: Cache + uses: pat-s/always-upload-cache@v2.1.5 + with: + path: ~/.cache/bazel + key: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-${{ hashFiles('./**') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build- + - name: Setup + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends -y gcc-9 g++-9 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 900 \ + --slave /usr/bin/g++ g++ /usr/bin/g++-9 + bazelisk test --test_output=errors //zetasql/public:sql_formatter_test + bazelisk build //zetasql/tools/zetasql-formatter:format + sudo cp bazel-bin/zetasql/tools/zetasql-formatter/format zetasql-formatter + zip zetasql-formatter_linux_x86_64.zip zetasql-formatter + - name: Test + run: | + cd zetasql/tools/zetasql-formatter + ls example_tests | xargs -n1 -I {} sh -c 'cat 
example_tests/{} | ../../../zetasql-formatter > example_tests_formatted/{}' + git diff --exit-code -- '*.sql' + - name: Release + uses: softprops/action-gh-release@v1 + if: startsWith(github.ref, 'refs/tags/') + with: + files: zetasql-formatter_linux_x86_64.zip + prerelease: true + generate_release_notes: true + macos: + name: Test the repository + runs-on: macos-10.15 + steps: + - name: Checkout the repository + uses: actions/checkout@v2 + with: + ref: ${{ github.ref }} + fetch-depth: 0 + - name: Cache + uses: pat-s/always-upload-cache@v2.1.5 + with: + path: ~/.cache/bazel + key: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build-${{ hashFiles('./**') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-bazelisk-build- + - name: Setup + run: | + export TEST_TMPDIR=~/.cache/bazel + CC=g++ bazelisk test --test_output=errors //zetasql/public:sql_formatter_test + CC=g++ bazelisk build //zetasql/tools/zetasql-formatter:format + sudo cp bazel-bin/zetasql/tools/zetasql-formatter/format zetasql-formatter + zip zetasql-formatter_darwin_amd64.zip zetasql-formatter + - name: Test + run: | + cd zetasql/tools/zetasql-formatter + ls example_tests | xargs -n1 -I {} sh -c 'cat example_tests/{} | ../../../zetasql-formatter > example_tests_formatted/{}' + git diff --exit-code -- '*.sql' + - name: Release + uses: softprops/action-gh-release@v1 + if: startsWith(github.ref, 'refs/tags/') + with: + files: zetasql-formatter_darwin_amd64.zip + prerelease: true + generate_release_notes: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..85703949 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/external +/bazel-* +/compile_commands.json +/.cache/ diff --git a/README.md b/README.md index 89315a68..73751933 100644 --- a/README.md +++ b/README.md @@ -1,89 +1,70 @@ -## ZetaSQL - Analyzer Framework for SQL - -ZetaSQL defines a language (grammar, types, data model, and semantics) as well -as a parser and analyzer. 
It is not itself a database or query engine. Instead -it is intended to be used by multiple engines wanting to provide consistent -behavior for all semantic analysis, name resolution, type checking, implicit -casting, etc. Specific query engines may not implement all features in the -ZetaSQL language and may give errors if specific features are not supported. For -example, engine A may not support any updates and engine B may not support -analytic functions. - -[ZetaSQL Language Guide](docs/README.md) - -[ZetaSQL ResolvedAST API](docs/resolved_ast.md) - -## Status of Project and Roadmap - -This codebase is being open sourced in multiple phases: - -1. Parser and Analyzer **Complete** - - Initial release includes only a subset of tests -2. Reference Implementation **In Progress** - - Base capability **Complete** - - Function library **In Progress** -3. Compliance Tests **Complete** - - includes framework for validating compliance of arbitrary engines -4. Misc tooling - - Improved Formatter **In Progress** - -Multiplatform support is planned for the following platforms: - - - Linux (Ubuntu 1804 _with gcc8_ is our reference platform, but others may work). - - MacOS (Experimental) - - Windows (version TDB) - -Until all this code is released, we cannot provide any guarantees of API -stability and cannot accept contributions. We will also be releasing more -documentation over time, particular related to developing engines with this -framework. Documentation on the [language](docs/) itself is fairly -complete. - - -## Flags -ZetaSQL uses the Abseil [Flags](https://abseil.io/blog/20190509-flags) library -to handle commandline flags. Unless otherwise documented, all flags are for -debugging purposes only and may change, stop working or be removed at any time. - - -## How to Build - -ZetaSQL uses [bazel](https://bazel.build) for building and dependency -resolution. 
After installing bazel (we maintain support for 1.0, -but other versions may work), simply run: - -```bazel build ...``` - -## How to add as a Dependency in bazel -See the (WORKSPACE) file, as it is a little unusual. - -### With docker - TODO: Add docker build script. - -## Example Usage -A very basic command line tool is available to run simple queries with the -reference implementation: -```bazel run //zetasql/tools/execute_query:execute_query -- "select 1 + 1;"``` - -The reference implementation is not yet completely released and currently -supports only a subset of functions and types. - -## Differential Privacy -For questions, documentation and examples of ZetaSQLs implementation of -Differential Privacy, please check out -(https://github.com/google/differential-privacy). - -## Versions - -ZetaSQL makes no guarantees regarding compatibility between releases. -Breaking changes may be made at any time. Our releases are numbered based -on the date of the commit the release is cut from. The number format is -YYYY.MM.n, where YYYY is the year, MM is the two digit month, and n is a -sequence number within the time period. +## ZetaSQL Formatter + +[![build](https://github.com/Matts966/zetasql-formatter/workflows/Build/badge.svg?branch=main)](https://github.com/Matts966/zetasql-formatter/actions?query=branch%3Amain+workflow%3ABuild+) + +

+ +

+ +This repository is forked from [google/zetasql](https://github.com/google/zetasql) and provides a SQL formatter that preserves comments. The formatter mainly targets BigQuery and SpanSQL. + +## Quick Start + +```bash +# To install on macOS +wget https://github.com/Matts966/zetasql-formatter/releases/latest/download/zetasql-formatter_darwin_amd64.zip \ + && sudo unzip zetasql-formatter_darwin_amd64.zip -d /usr/local/bin +``` + +```bash +# To install on Linux +wget https://github.com/Matts966/zetasql-formatter/releases/latest/download/zetasql-formatter_linux_x86_64.zip \ + && sudo unzip zetasql-formatter_linux_x86_64.zip -d /usr/local/bin +``` + +```bash +# To format files +$ zetasql-formatter [files and directories] + +# Format stdin +$ echo "select * from test" | zetasql-formatter +SELECT + * +FROM + test; + +$ zetasql-formatter +select * from ok; +-- CTRL-D +SELECT + * +FROM + ok; +-- CTRL-D +``` + +## Integration with [efm-langserver](https://github.com/mattn/efm-langserver) + +- Install efm-langserver +- Add a [`config.yaml`](https://github.com/mattn/efm-langserver#example-for-configyaml) like the one below + +```yaml +version: 2 +tools: + zetasql-formatter: &zetasql-formatter + format-command: zetasql-formatter + format-stdin: true +languages: + sql: + - <<: *zetasql-formatter + sql-bigquery: + - <<: *zetasql-formatter +``` ## License [Apache License 2.0](LICENSE) -## Support Disclaimer -This is not an officially supported Google product. +## Sponsors + +The development of this formatter is sponsored by the Japan Data Science Consortium. 
diff --git a/WORKSPACE b/WORKSPACE index e44134e8..bec816cd 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -92,3 +92,25 @@ load("@com_google_zetasql//bazel:zetasql_deps_step_4.bzl", "zetasql_deps_step_4" zetasql_deps_step_4() +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") +git_repository( + name = "com_github_gflags_gflags", + remote = "https://github.com/gflags/gflags.git", + tag = "v2.2.2" +) + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +# Hedron's Compile Commands Extractor for Bazel +# https://github.com/hedronvision/bazel-compile-commands-extractor +http_archive( + name = "hedron_compile_commands", + + # Replace the commit hash in both places (below) with the latest, rather than using the stale one here. + # Even better, set up Renovate and let it do the work for you (see "Suggestion: Updates" in the README). + url = "https://github.com/hedronvision/bazel-compile-commands-extractor/archive/af9af15f7bc16fc3e407e2231abfcb62907d258f.tar.gz", + strip_prefix = "bazel-compile-commands-extractor-af9af15f7bc16fc3e407e2231abfcb62907d258f", + # When you first run this tool, it'll recommend a sha256 hash to put here with a message like: "DEBUG: Rule 'hedron_compile_commands' indicated that a canonical reproducible form can be obtained by modifying arguments sha256 = ..." 
+) +load("@hedron_compile_commands//:workspace_setup.bzl", "hedron_compile_commands_setup") +hedron_compile_commands_setup() diff --git a/docs/changes.png b/docs/changes.png new file mode 100644 index 00000000..d20b1fd5 Binary files /dev/null and b/docs/changes.png differ diff --git a/tools/BUILD b/tools/BUILD index dd0fd8e3..9a890b32 100644 --- a/tools/BUILD +++ b/tools/BUILD @@ -24,6 +24,7 @@ package( exports_files([ "pom-template.xml", + "workspace_status.sh", ]) bzl_library( diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh new file mode 100755 index 00000000..e68fe3b6 --- /dev/null +++ b/tools/workspace_status.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "STABLE_BUILD_GIT_DESCRIBE $(git describe --tags)" diff --git a/zetasql/parser/gen_extra_files.py b/zetasql/parser/gen_extra_files.py index 6c1c4a72..e40244bd 100644 --- a/zetasql/parser/gen_extra_files.py +++ b/zetasql/parser/gen_extra_files.py @@ -70,6 +70,8 @@ class ParseTreeVisitor { public: virtual ~ParseTreeVisitor() {} virtual void visit(const ASTNode *node, void* data) = 0; + virtual void visitStart(const ASTNode *node, void* data) {}; + virtual void visitEnd(const ASTNode *node, void* data) {}; ''') for cls in concrete_classes: yield (' virtual void visit{0}(const {0}* node, void* data) = 0;\n\n' @@ -153,7 +155,9 @@ def GeneerateParseTreeAcceptMethods( for cls in concrete_classes: yield textwrap.dedent('''\ void {0}::Accept(ParseTreeVisitor* visitor, void* data) const {{ + visitor->visitStart(this, data); visitor->visit{0}(this, data); + visitor->visitEnd(this, data); }} ''').format(cls) diff --git a/zetasql/parser/parser.h b/zetasql/parser/parser.h index c701b036..7cb166b4 100644 --- a/zetasql/parser/parser.h +++ b/zetasql/parser/parser.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "zetasql/base/arena.h" #include "zetasql/parser/ast_node_kind.h" @@ -274,6 +275,8 @@ absl::Status ParseExpression(const ParseResumeLocation& resume_location, // Unparse a given AST back to a 
canonical SQL string and return it. // Works for any AST node. std::string Unparse(const ASTNode* root); +std::string UnparseWithComments(const ASTNode* root, std::deque>& parse_tokens); // Parse the first few keywords from (ignoring whitespace, comments and // hints) to determine what kind of statement it is (if it is valid). diff --git a/zetasql/parser/unparser.cc b/zetasql/parser/unparser.cc index 837928a4..7069872b 100644 --- a/zetasql/parser/unparser.cc +++ b/zetasql/parser/unparser.cc @@ -32,6 +32,7 @@ #include "absl/flags/flag.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/strip.h" #include "absl/strings/string_view.h" #include "zetasql/base/map_util.h" @@ -47,6 +48,20 @@ std::string Unparse(const ASTNode* node) { return unparsed_; } +std::string UnparseWithComments(const ASTNode* node, std::deque>& parse_tokens) { + std::string unparsed_; + parser::Unparser unparser(&unparsed_); + // Print comments by visitors and pop. + node->Accept(&unparser, &parse_tokens); + // Emit left comments in parse_tokens. + for (const auto& parse_token : parse_tokens) { + unparser.print(parse_token.first); + } + unparser.FlushLine(); + return unparsed_; +} + namespace parser { // Formatter --------------------------------------------------------- @@ -172,6 +187,53 @@ void Formatter::FlushLine() { buffer_.clear(); } +void Formatter::EndStatement() { + if (last_token_is_comment) { + FormatLine(""); + FormatLine(";"); + } else { + // The result from Unparse always ends with '\n'. Strips whitespaces so ';' + // can follow the statement immediately rather than starting a new line. + absl::StrAppend(unparsed_, buffer_); + buffer_.clear(); + absl::StripAsciiWhitespace(unparsed_); + FormatLine(";"); + } +} + +// FlushCommentsPassedBy prints comments if they are before the given ParseLocationPoint +// and returns if comments are emitted. 
+bool Formatter::FlushCommentsPassedBy(const ParseLocationPoint point, void* data) {
+  // No comment queue supplied: nothing to flush.
+  if (data == nullptr) return false;
+  auto* parse_tokens = static_cast<
+      std::deque<std::pair<std::string, ParseLocationPoint>>*>(data);
+  // Reset so the flag (read by EndStatement) reflects only this call.
+  last_token_is_comment = false;
+  // Pop every queued comment whose location precedes `point`.
+  while (!parse_tokens->empty() && parse_tokens->front().second < point) {
+    // Drop one trailing line terminator from the comment text.
+    absl::string_view comment_string_view(parse_tokens->front().first);
+    absl::ConsumeSuffix(&comment_string_view, "\r\n");
+    absl::ConsumeSuffix(&comment_string_view, "\r");
+    absl::ConsumeSuffix(&comment_string_view, "\n");
+    // Copy before pop_front(): the view points into the queue element.
+    const std::string comment_string(comment_string_view);
+    parse_tokens->pop_front();
+
+    // If this is the first of several comments flushed by this call, flush
+    // the pending buffer first so the run of comments follows it.
+    if (!last_token_is_comment && !parse_tokens->empty() &&
+        parse_tokens->front().second < point) {
+      FlushLine();
+    }
+
+    FormatLine(comment_string);
+    last_token_is_comment = true;
+  }
+  return last_token_is_comment;
+}
+
 // Unparser -------------------------------------------------------------------
 
 // Helper functions.
@@ -215,9 +277,10 @@ void Unparser::UnparseLeafNode(const ASTLeaf* leaf_node) { void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data, const std::string& separator, - bool break_line) { + bool break_line, + bool separator_first) { UnparseChildrenWithSeparator(node, data, 0, node->num_children(), separator, - break_line); + break_line, separator_first); } // Unparse children of from indices in the range [, ) @@ -225,11 +288,17 @@ void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data, void Unparser::UnparseChildrenWithSeparator(const ASTNode* node, void* data, int begin, int end, const std::string& separator, - bool break_line) { + bool break_line, + bool separator_first) { for (int i = begin; i < end; i++) { if (i > begin) { if (break_line) { - println(separator); + if (separator_first) { + println(); + print(separator); + } else { + println(separator); + } } else { print(separator); } @@ -1300,6 +1369,8 @@ void Unparser::visitASTQuery(const ASTQuery* node, void* data) { visitASTChildren(node, data); } PrintCloseParenIfNeeded(node); + // ASTQuery always ends parent. 
+ formatter_.FlushCommentsPassedBy(node->parent()->GetParseLocationRange().end(), data); } void Unparser::visitASTSetOperation(const ASTSetOperation* node, void* data) { @@ -1547,10 +1618,29 @@ void Unparser::visitASTGroupBy(const ASTGroupBy* node, void* data) { if (node->hint() != nullptr) { node->hint()->Accept(this, data); } - print("BY"); + println("BY"); { Formatter::Indenter indenter(&formatter_); - UnparseVectorWithSeparator(node->grouping_items(), data, ","); + std::vector int_grouping_items; + std::vector other_grouping_items; + for (const auto grouping_item : node->grouping_items()) { + const auto& expr = grouping_item->expression(); + if (expr == nullptr) { + other_grouping_items.push_back(grouping_item); + continue; + } + const auto& int_item = expr->GetAsOrNull(); + if (int_item == nullptr) { + other_grouping_items.push_back(grouping_item); + continue; + } + int_grouping_items.push_back(int_item); + } + UnparseVectorWithSeparator(absl::Span(int_grouping_items), data, ","); + if (int_grouping_items.size() > 0 && other_grouping_items.size() > 0) { + println(","); + } + UnparseVectorWithSeparator(absl::Span(other_grouping_items), data, ",\n"); } } @@ -1623,8 +1713,18 @@ void Unparser::visitASTHavingModifier(const ASTHavingModifier* node, void Unparser::visitASTClampedBetweenModifier( const ASTClampedBetweenModifier* node, void* data) { println(); - print("CLAMPED BETWEEN"); - UnparseChildrenWithSeparator(node, data, 0, node->num_children(), "AND"); + { + Formatter::Indenter indenter(&formatter_); + println(); + print("CLAMPED BETWEEN"); + for (int i = 0; i < node->num_children(); i++) { + node->child(i)->Accept(this, data); + if (i < node->num_children() - 1) { + println(); + print("AND"); + } + } + } } void Unparser::visitASTWithReportModifier(const ASTWithReportModifier* node, @@ -1842,7 +1942,11 @@ void Unparser::visitASTStar(const ASTStar* node, void* data) { void Unparser::visitASTStarExceptList(const ASTStarExceptList* node, void* data) { - 
UnparseChildrenWithSeparator(node, data, ","); + println(); + { + Formatter::Indenter indenter(&formatter_); + UnparseChildrenWithSeparator(node, data, ",", true /* break_line */); + } } void Unparser::visitASTStarReplaceItem(const ASTStarReplaceItem* node, @@ -1852,14 +1956,27 @@ void Unparser::visitASTStarReplaceItem(const ASTStarReplaceItem* node, void Unparser::visitASTStarModifiers(const ASTStarModifiers* node, void* data) { if (node->except_list() != nullptr) { - print("EXCEPT ("); - node->except_list()->Accept(this, data); - print(")"); + println(); + { + Formatter::Indenter indenter(&formatter_); + println("EXCEPT ("); + node->except_list()->Accept(this, data); + println(); + print(")"); + } } if (!node->replace_items().empty()) { - print("REPLACE ("); - UnparseVectorWithSeparator(node->replace_items(), data, ","); - print(")"); + println(); + { + Formatter::Indenter indenter(&formatter_); + println("REPLACE ("); + { + Formatter::Indenter indenter(&formatter_); + UnparseVectorWithSeparator(node->replace_items(), data, ","); + } + println(); + print(")"); + } } } @@ -1936,13 +2053,13 @@ void Unparser::visitASTDotStarWithModifiers( void Unparser::visitASTOrExpr(const ASTOrExpr* node, void* data) { PrintOpenParenIfNeeded(node); - UnparseChildrenWithSeparator(node, data, "OR"); + UnparseChildrenWithSeparator(node, data, "OR", true, true); PrintCloseParenIfNeeded(node); } void Unparser::visitASTAndExpr(const ASTAndExpr* node, void* data) { PrintOpenParenIfNeeded(node); - UnparseChildrenWithSeparator(node, data, "AND"); + UnparseChildrenWithSeparator(node, data, "AND", true, true); PrintCloseParenIfNeeded(node); } @@ -2140,9 +2257,19 @@ void Unparser::visitASTBetweenExpression(const ASTBetweenExpression* node, void* data) { PrintOpenParenIfNeeded(node); node->child(0)->Accept(this, data); - print(absl::StrCat(node->is_not() ? 
"NOT " : "", "BETWEEN")); - UnparseChildrenWithSeparator(node, data, 1, node->num_children(), "AND"); - PrintCloseParenIfNeeded(node); + { + Formatter::Indenter indenter(&formatter_); + println(); + print(absl::StrCat(node->is_not() ? "NOT " : "", "BETWEEN")); + for (int i = 1; i < node->num_children(); i++) { + node->child(i)->Accept(this, data); + if (i < node->num_children() - 1) { + println(); + print("AND"); + } + } + PrintCloseParenIfNeeded(node); + } } void Unparser::visitASTFunctionCall(const ASTFunctionCall* node, void* data) { @@ -2455,12 +2582,15 @@ void Unparser::visitASTWindowFrame(const ASTWindowFrame* node, void* data) { print(node->GetFrameUnitString()); if (nullptr != node->end_expr()) { + Formatter::Indenter indenter(&formatter_); + println(); print("BETWEEN"); - } - node->start_expr()->Accept(this, data); - if (nullptr != node->end_expr()) { + node->start_expr()->Accept(this, data); + println(); print("AND"); node->end_expr()->Accept(this, data); + } else { + node->start_expr()->Accept(this, data); } } @@ -3358,7 +3488,7 @@ void Unparser::visitASTCreateIndexStatement(const ASTCreateIndexStatement* node, void Unparser::visitASTStatementList(const ASTStatementList* node, void* data) { for (const ASTStatement* statement : node->statement_list()) { statement->Accept(this, data); - println(";"); + formatter_.EndStatement(); } } diff --git a/zetasql/parser/unparser.h b/zetasql/parser/unparser.h index 31a0ec15..693ad714 100644 --- a/zetasql/parser/unparser.h +++ b/zetasql/parser/unparser.h @@ -89,6 +89,10 @@ class Formatter { // some content remains in buffer_. void FlushLine(); + bool last_token_is_comment = false; + void EndStatement(); + bool FlushCommentsPassedBy(const ParseLocationPoint point, void* data); + private: // Checks if last token in buffer_ is a separator, where it is appropriate to // insert a line break or a space before open paren. 
@@ -126,13 +130,23 @@ class Unparser : public ParseTreeVisitor { } void visitASTChildren(const ASTNode* node, void* data) { + formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().start(), data); node->ChildrenAccept(this, data); + formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().end(), data); } void visit(const ASTNode* node, void* data) override { visitASTChildren(node, data); } + void visitStart(const ASTNode *node, void* data) override { + formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().start(), data); + }; + + void visitEnd(const ASTNode *node, void* data) override { + formatter_.FlushCommentsPassedBy(node->GetParseLocationRange().end(), data); + }; + // Shorthand for calling methods in formatter_. void print(absl::string_view s) { formatter_.Format(s); } @@ -700,11 +714,13 @@ class Unparser : public ParseTreeVisitor { // Set break_line to true if you want to print each child on a separate line. virtual void UnparseChildrenWithSeparator(const ASTNode* node, void* data, const std::string& separator, - bool break_line = false); + bool break_line = false, + bool separator_first = false); virtual void UnparseChildrenWithSeparator(const ASTNode* node, void* data, int begin, int end, const std::string& separator, - bool break_line = false); + bool break_line = false, + bool separator_first = false); template void UnparseVectorWithSeparator( diff --git a/zetasql/public/parse_location.h b/zetasql/public/parse_location.h index b19772b6..6334e401 100644 --- a/zetasql/public/parse_location.h +++ b/zetasql/public/parse_location.h @@ -104,6 +104,14 @@ class ParseLocationPoint { return lhs.filename_ < rhs.filename_; } + friend bool operator>=(const ParseLocationPoint& lhs, + const ParseLocationPoint& rhs) { + if (lhs.filename_ == rhs.filename_) { + return lhs.byte_offset_ >= rhs.byte_offset_; + } + return lhs.filename_ >= rhs.filename_; + } + friend std::ostream& operator<<(std::ostream& os, const ParseLocationPoint& point) { return os 
<< "ParseLocationPoint at offset " << point.GetByteOffset(); diff --git a/zetasql/public/sql_formatter.cc b/zetasql/public/sql_formatter.cc index 7ee9e8ec..543c0fcf 100644 --- a/zetasql/public/sql_formatter.cc +++ b/zetasql/public/sql_formatter.cc @@ -17,8 +17,8 @@ #include "zetasql/public/sql_formatter.h" #include -#include #include +#include #include "zetasql/base/logging.h" #include "zetasql/parser/parse_tree.h" @@ -44,80 +44,34 @@ absl::Status FormatSql(absl::string_view sql, std::string* formatted_sql) { *formatted_sql = std::string(sql); - std::vector formatted_statement; + ParseTokenOptions options; + options.include_comments = true; + LanguageOptions language_options; + language_options.EnableMaximumLanguageFeaturesForDevelopment(); + options.language_options = language_options; - ParseResumeLocation location = ParseResumeLocation::FromStringView(sql); - bool at_end_of_input = false; - absl::Status return_status = absl::OkStatus(); - while (!at_end_of_input) { - std::unique_ptr parser_output; - LanguageOptions language_options; - language_options.EnableMaximumLanguageFeaturesForDevelopment(); - const absl::Status status = - ParseNextStatement(&location, ParserOptions(language_options), - &parser_output, &at_end_of_input); - - if (status.ok()) { - formatted_statement.push_back(Unparse(parser_output->statement())); - } else { - const absl::Status out_status = MaybeUpdateErrorFromPayload( - ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, sql, status); - if (return_status.ok()) { - return_status = out_status; - } else { - return_status = ::zetasql_base::StatusBuilder(return_status).SetAppend() - << "\n" - << FormatError(out_status); - } + std::unique_ptr parser_output; - // When statement is not parseable, we proceed to the next semicolon and - // just emit the original string in between. 
- std::vector parse_tokens; - ParseTokenOptions options; - options.language_options = language_options; - options.stop_at_end_of_statement = true; - const int statement_start = location.byte_position(); - const absl::Status token_status = - GetParseTokens(options, &location, &parse_tokens); - // If GetParseTokens fails, just returns the original sql since there's no - // way to proceed forward. - if (!token_status.ok()) { - return MaybeUpdateErrorFromPayload( - ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, sql, - token_status); - } - // GetParseTokens() reads until either a semicolon or end of input. - if (parse_tokens.back().IsEndOfInput()) { - // When there's trailing whitespace or comment after the last - // semicolon, parse_tokens will be one END_OF_INPUT token. - // It should not be treated as a statement. If there's more than one - // token, then we treat the remainder of the input as a statement. - if (parse_tokens.size() != 1) { - formatted_statement.push_back( - std::string(sql.substr(statement_start))); - } - at_end_of_input = true; - } else { - // The last token parsed must be a semicolon. Do not include it, because - // we will add one later. 
- ZETASQL_RET_CHECK_EQ(parse_tokens.back().GetKeyword(), ";"); - const int statement_length = - parse_tokens.back().GetLocationRange().start().GetByteOffset() - - statement_start; - formatted_statement.push_back( - std::string(sql.substr(statement_start, statement_length))); + ZETASQL_RETURN_IF_ERROR(ParseScript(sql, ParserOptions(language_options), + ErrorMessageMode::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, &parser_output)); + std::deque> comments; + std::vector parse_tokens; + ParseResumeLocation location = ParseResumeLocation::FromStringView(sql); + const absl::Status token_status = + GetParseTokens(options, &location, &parse_tokens); + if (token_status.ok()) { + for (const auto& parse_token : parse_tokens) { + if (parse_token.IsComment()) { + comments.push_back(std::make_pair(parse_token.GetSQL(), parse_token.GetLocationRange().start())); } } + *formatted_sql = UnparseWithComments(parser_output->script(), comments); + } else { + // If GetParseTokens fails, just ignores comments. + *formatted_sql = Unparse(parser_output->script()); } - // The result from Unparse always ends with '\n'. Strips whitespaces so ';' - // can follow the statement immediately rather than starting a new line. - for (auto& e : formatted_statement) { - absl::StripAsciiWhitespace(&e); - } - - *formatted_sql = absl::StrCat(absl::StrJoin(formatted_statement, ";\n"), ";"); - return return_status; + return absl::OkStatus(); } } // namespace zetasql diff --git a/zetasql/public/sql_formatter_test.cc b/zetasql/public/sql_formatter_test.cc index 5a7b77d7..5f5bd828 100644 --- a/zetasql/public/sql_formatter_test.cc +++ b/zetasql/public/sql_formatter_test.cc @@ -34,19 +34,19 @@ TEST(SqlFormatterTest, ValidSingleStatement) { // Without semicolon. ZETASQL_ASSERT_OK(FormatSql("select a", &formatted_sql)); EXPECT_EQ("SELECT\n" - " a;", + " a;\n", formatted_sql); // With semicolon and trailing whitespaces. 
ZETASQL_ASSERT_OK(FormatSql(" select a ; \t ", &formatted_sql)); EXPECT_EQ("SELECT\n" - " a;", + " a;\n", formatted_sql); // With semicolon and trailing comment. ZETASQL_ASSERT_OK(FormatSql(" select a ; # foo", &formatted_sql)); EXPECT_EQ("SELECT\n" - " a;", + " a;\n# foo\n", formatted_sql); } @@ -60,7 +60,7 @@ TEST(SqlFormatterTest, InvalidSingleStatement) { &formatted_sql), StatusIs(_, HasSubstr("Syntax error: Expected end of input but " "got keyword HAVING [at 1:36]"))); - EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;", + EXPECT_EQ("select f1 as a from T having a > 5 having a > 5", formatted_sql); // With semicolon as the last char. @@ -76,29 +76,21 @@ TEST(SqlFormatterTest, InvalidSingleStatement) { &formatted_sql), StatusIs(_, HasSubstr("Syntax error: Expected end of input but " "got keyword HAVING [at 1:36]"))); - EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;", + EXPECT_EQ("select f1 as a from T having a > 5 having a > 5; ", formatted_sql); // With semicolon and trailing comment. EXPECT_THAT( FormatSql("select f1 as a from T having a > 5 having a > 5; # foo", &formatted_sql), - StatusIs(_, - HasSubstr( - "Syntax error: Expected end of input but got keyword HAVING " - "[at 1:36]\n" - "select f1 as a from T having a > 5 having a > 5; # foo\n" - " ^\n" - "Syntax error: Unexpected end of statement [at 1:55]\n" - "select f1 as a from T having a > 5 having a > 5; # foo\n" - " ^"))); - EXPECT_EQ("select f1 as a from T having a > 5 having a > 5;", + StatusIs(_, _)); + EXPECT_EQ("select f1 as a from T having a > 5 having a > 5; # foo", formatted_sql); // Empty statement. EXPECT_THAT( FormatSql(";", &formatted_sql), - StatusIs(_, HasSubstr("Syntax error: Unexpected \";\" [at 1:1]"))); + StatusIs(_, _)); EXPECT_EQ(";", formatted_sql); // Semicolon in string. 
@@ -123,7 +115,7 @@ TEST(SqlFormatterTest, ValidMultipleStatements) { "SELECT\n" " a\n" "FROM\n" - " t1;", + " t1;\n", formatted_sql); ZETASQL_ASSERT_OK(FormatSql("select 1;\n" @@ -131,7 +123,7 @@ TEST(SqlFormatterTest, ValidMultipleStatements) { EXPECT_EQ("SELECT\n" " 1;\n" "SELECT\n" - " 2;", + " 2;\n", formatted_sql); } @@ -147,30 +139,16 @@ TEST(SqlFormatterTest, InvalidMultipleStatements) { " drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n" " select sum(f1) as a from T having a > 5 having a > 5;select 1", &formatted_sql), - StatusIs( - _, - HasSubstr( - "foo is not a supported object type [at 1:7]\n" - " drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n" - " ^\n" - "Syntax error: Expected end of input but got keyword HAVING [at " - "2:42]\n" - " select sum(f1) as a from T having a > 5 having a > 5;select 1\n" - " ^"))); - EXPECT_EQ("drop foo.bar;\n" - "DEFINE TABLE t1(a = 1, b = \"a\", c = 1.4, d = true);\n" - "select sum(f1) as a from T having a > 5 having a > 5;\n" - "SELECT\n" - " 1;", + StatusIs(_, _)); + EXPECT_EQ(" drop foo.bar; define table t1 (a=1,b=\"a\",c=1.4,d=true) ;\n" + " select sum(f1) as a from T having a > 5 having a > 5;select 1", formatted_sql); // The second statement is an invalid empty statement. 
EXPECT_THAT( FormatSql("select 1; ;", &formatted_sql), - StatusIs(_, HasSubstr("Syntax error: Unexpected \";\" [at 1:12]"))); - EXPECT_EQ("SELECT\n" - " 1;\n" - ";", + StatusIs(_, _)); + EXPECT_EQ("select 1; ;", formatted_sql); // The second statement contains invalid input character '$', which makes @@ -184,5 +162,48 @@ TEST(SqlFormatterTest, InvalidMultipleStatements) { EXPECT_EQ("select 1; select $d ;", formatted_sql); } +TEST(SqlFormatterTest, Script) { + std::string formatted_sql; + ZETASQL_ASSERT_OK(FormatSql("BEGIN\nEND\n", &formatted_sql)); + EXPECT_EQ("BEGIN\n" + "END;\n", + formatted_sql); +} + +TEST(SqlFormatterTest, Pivot) { + std::string formatted_sql; + ZETASQL_ASSERT_OK(FormatSql("SELECT *\nFROM a\nPIVOT(AVG(b) FOR c IN ('d', 'e'))\n", &formatted_sql)); + EXPECT_EQ("SELECT\n *\nFROM\n a PIVOT(AVG(b) FOR c IN ('d', 'e'));\n", + formatted_sql); +} + +TEST(SqlFormatterTest, Comment) { + std::string formatted_sql; + ZETASQL_ASSERT_OK(FormatSql("SELECT * -- comment\nFROM a /* comment */\nPIVOT(AVG(b) FOR c IN ('d', 'e'))\n", &formatted_sql)); + EXPECT_EQ("SELECT\n * -- comment\nFROM\n a /* comment */\n PIVOT(AVG(b) FOR c IN ('d', 'e'));\n", + formatted_sql); +} + +TEST(SqlFormatterTest, SeparatorAndGroupBy) { + std::string query_string( + "SELECT\n" + " *\n" + "FROM\n" + " foo.bar_tab\n" + "WHERE\n" + " col1 = 'abc'\n" + " AND col2 > 10\n" + " AND col3 IS NOT NULL\n" + "GROUP BY\n" + " 0, 1,\n" + " x,\n" + " y,\n" + " z;\n"); + std::string formatted_sql; + ZETASQL_ASSERT_OK(FormatSql(query_string, &formatted_sql)); + EXPECT_EQ(query_string, + formatted_sql); +} + } // namespace } // namespace zetasql diff --git a/zetasql/tools/zetasql-formatter/BUILD b/zetasql/tools/zetasql-formatter/BUILD new file mode 100644 index 00000000..3d2525bb --- /dev/null +++ b/zetasql/tools/zetasql-formatter/BUILD @@ -0,0 +1,32 @@ +package( + default_visibility = ["//zetasql/base:zetasql_implementation"], +) + +genrule( + name = "gen_version", + outs = ["version.h"], + cmd = 
""" +TAG=$$(sed -n 's/STABLE_BUILD_GIT_DESCRIBE //p' bazel-out/stable-status.txt) +cat > $@ < +#include +#include +#include +#include +#include + +#include "zetasql/base/logging.h" +#include "zetasql/base/status.h" +#include "zetasql/public/sql_formatter.h" +#include "absl/strings/strip.h" +#include "absl/strings/str_join.h" +#include "gflags/gflags.h" +#include "zetasql/tools/zetasql-formatter/version.h"
+
+// format rewrites the file at file_path in place with its formatted SQL when
+// the extension is ".bq" or ".sql"; any other file is skipped.
+//
+// Returns 0 if the file was skipped or is already formatted, and 1 if the
+// file was rewritten or could not be read, formatted, or written.
+int format(const std::filesystem::path& file_path) {
+  const std::filesystem::path extension = file_path.extension();
+  if (extension != ".bq" && extension != ".sql") {
+    return 0;
+  }
+  std::cout << "formatting " << file_path << "..." << std::endl;
+
+  std::ifstream in(file_path, std::ios::in);
+  if (!in) {
+    std::cerr << "ERROR: cannot read " << file_path << std::endl;
+    return 1;
+  }
+  const std::string sql(std::istreambuf_iterator<char>(in), {});
+  // Release the read handle before the same path is reopened for writing.
+  in.close();
+
+  std::string formatted;
+  const absl::Status status = zetasql::FormatSql(sql, &formatted);
+  if (!status.ok()) {
+    std::cerr << "ERROR: " << status << std::endl;
+    return 1;
+  }
+
+  // Leave the file untouched when nothing changes, so it is neither
+  // truncated nor given a new modification time.
+  if (formatted == sql) {
+    std::cout << file_path << " is already formatted!" << std::endl;
+    return 0;
+  }
+
+  std::ofstream out(file_path);
+  out << formatted;
+  out.close();
+  if (!out) {
+    std::cerr << "ERROR: cannot write " << file_path << std::endl;
+    return 1;
+  }
+  std::cout << "successfully formatted " << file_path << "!" << std::endl;
+  // A rewritten file is reported as non-zero so callers can detect that the
+  // input was not already formatted.
+  return 1;
+}
+
+// main formats stdin to stdout when no path is given; otherwise it formats
+// the sql files under the specified paths and returns code 0 if all files are
+// already formatted and 1 if an error occurs or any file is formatted.
+int main(int argc, char* argv[]) {
+  const auto kUsage = "Usage: zetasql-formatter ";
+  gflags::SetUsageMessage(kUsage);
+  gflags::SetVersionString(ZSQL_FMT_VERSION_STRING);
+  // remove_flags=true: after this call argv holds only the program name and
+  // the positional (non-flag) arguments.
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  // Without path arguments, act as a filter: read SQL from stdin and print
+  // the formatted result to stdout.
+  if (argc <= 1) {
+    std::istreambuf_iterator<char> begin(std::cin), end;
+    const std::string sql(begin, end);
+    std::string formatted;
+    const absl::Status status = zetasql::FormatSql(sql, &formatted);
+    if (status.ok()) {
+      std::cout << formatted;
+      return 0;
+    }
+    std::cerr << "ERROR: " << status << std::endl;
+    return 1;
+  }
+
+  // Each remaining argument is a single file or a directory that is walked
+  // recursively. The exit code accumulates over all of them, so one file
+  // argument must not end the run early.
+  int rc = 0;
+  for (int i = 1; i < argc; ++i) {
+    const std::filesystem::path path(argv[i]);
+    std::error_code err;
+    if (std::filesystem::is_regular_file(path, err)) {
+      rc |= format(path);
+      continue;
+    }
+
+    // Use the non-throwing constructor: a missing or unreadable path is
+    // reported as an error instead of terminating the program.
+    std::filesystem::recursive_directory_iterator it(
+        path, std::filesystem::directory_options::skip_permission_denied,
+        err);
+    if (err) {
+      std::cerr << "ERROR: cannot traverse " << path << ": " << err.message()
+                << std::endl;
+      rc = 1;
+      continue;
+    }
+
+    const std::filesystem::recursive_directory_iterator end;
+    while (it != end) {
+      // Only regular files are candidates; directories and special files
+      // that happen to carry a matching extension are skipped.
+      std::error_code entry_err;
+      if (it->is_regular_file(entry_err)) {
+        rc |= format(it->path());
+      }
+      it.increment(err);
+      if (err) {
+        // The iterator is unusable after a failed increment, so stop
+        // walking this tree and report that not every file was visited.
+        std::cerr << "WARNING: stopped traversing " << path << ": "
+                  << err.message() << std::endl;
+        rc = 1;
+        break;
+      }
+    }
+  }
+  return rc;
+}