Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
KOLANICH committed Oct 13, 2023
0 parents commit d48fc29
Show file tree
Hide file tree
Showing 84 changed files with 7,841 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .ci/pythonPackagesToInstallFromGit.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
https://github.com/gruns/icecream.git
https://github.com/tomerfiliba/plumbum.git
https://github.com/KOLANICH-libs/rangeslicetools.py.git
https://github.com/UniGrammar/charRanges.py.git
https://github.com/UniGrammar/escapelib.py.git
https://github.com/KOLANICH-libs/urm.py.git
https://github.com/UniGrammar/UniGrammarRuntimeCore.py.git
https://github.com/UniGrammar/UniGrammarRuntime.py.git
https://github.com/KOLANICH-libs/transformerz.py.git
15 changes: 15 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
root = true

[*]
charset = utf-8
indent_style = tab
indent_size = 4
insert_final_newline = true
end_of_line = lf

[*.{yml,yaml,yug}]
indent_style = space
indent_size = 2

[grammars/*.txt]
insert_final_newline = false
1 change: 1 addition & 0 deletions .github/.templateMarker
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
KOLANICH/python_project_boilerplate.py
8 changes: 8 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
allow:
- dependency-type: "all"
15 changes: 15 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: CI
on:
push:
branches: [master]
pull_request:
branches: [master]

jobs:
build:
runs-on: ubuntu-22.04
steps:
- name: typical python workflow
uses: KOLANICH-GHActions/typical-python-workflow@master
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
14 changes: 14 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
__pycache__
*.pyc
*.pyo
*.pgt
*.dot
/UniGrammar.egg-info
/build
/dist
/.eggs
/tests/grammars
monkeytype.sqlite3
*.srctrlprj
*.srctrldb
*.srctrlbm
55 changes: 55 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
image: registry.gitlab.com/kolanich-subgroups/docker-images/fixed_python:latest

variables:
DOCKER_DRIVER: overlay2
SAST_ANALYZER_IMAGE_TAG: latest
SAST_DISABLE_DIND: "true"
SAST_CONFIDENCE_LEVEL: 5
CODECLIMATE_VERSION: latest

include:
- template: SAST.gitlab-ci.yml
- template: Code-Quality.gitlab-ci.yml

.build:
tags:
- shared
- linux
stage: build
interruptible: true
variables:
GIT_DEPTH: "1"
PYTHONUSERBASE: ${CI_PROJECT_DIR}/python_user_packages

before_script:
- export PATH="$PATH:$PYTHONUSERBASE/bin" # don't move into `variables`
#- git clone --depth=1 --filter=sparse:path=src/python https://github.com/waxeye-org/waxeye.git
- git clone --depth=1 https://github.com/waxeye-org/waxeye.git
- cd ./waxeye/src/python
- python3 ./setup.py bdist_wheel
- pip3 install --upgrade ./dist/*.whl
- cd ../../../

cache:
paths:
- /usr/local/site-packages
- /usr/local/lib/python*/site-packages

script:
- python3 setup.py bdist_wheel
- pip3 install --user --upgrade ./dist/*.whl
- cd ./tests
#- coverage run -a --branch --source=UniGrammar -m pytest --junitxml=./rspec.xml --forked ./test*.py
#- coverage report -m || true
#- coveralls || true
#- codecov || true
#- cd ..
- mkdir wheels
- mv ./dist/*.whl ./wheels/UniGrammar-0.CI-py3-none-any.whl

artifacts:
paths:
- wheels
- $PYTHONUSERBASE
reports:
junit: ./rspec.xml
1 change: 1 addition & 0 deletions Code_Of_Conduct.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
No codes of conduct!
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include UNLICENSE
include *.md
include tests
global-include .editorconfig
global-include *.pgt
global-include *.pglr
109 changes: 109 additions & 0 deletions ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
UniGrammar.py [![Unlicensed work](https://raw.githubusercontent.com/unlicense/unlicense.org/master/static/favicon.png)](https://unlicense.org/)
=============
~~[wheel (GitLab)](https://gitlab.com/UniGrammar/UniGrammar.py/-/jobs/artifacts/master/raw/dist/UniGrammar-0.CI-py3-none-any.whl?job=build)~~
~~[wheel (GHA via `nightly.link`)](https://nightly.link/UniGrammar/UniGrammar.py/workflows/CI/master/UniGrammar-0.CI-py3-none-any.whl)~~
~~![GitLab Build Status](https://gitlab.com/UniGrammar/UniGrammar.py/badges/master/pipeline.svg)~~
~~![GitLab Coverage](https://gitlab.com/UniGrammar/UniGrammar.py/badges/master/coverage.svg)~~
~~[![GitHub Actions](https://github.com/UniGrammar/UniGrammar.py/workflows/CI/badge.svg)](https://github.com/UniGrammar/UniGrammar.py/actions/)~~
[![Libraries.io Status](https://img.shields.io/librariesio/github/UniGrammar/UniGrammar.py.svg)](https://libraries.io/github/UniGrammar/UniGrammar.py)
[![Code style: antiflash](https://img.shields.io/badge/code%20style-antiflash-FFF.svg)](https://codeberg.org/KOLANICH-tools/antiflash.py)

UniGrammar is a tool providing a unified [DSL](https://en.wikipedia.org/wiki/Domain-specific_language) for writing [grammars](https://en.wikipedia.org/wiki/Formal_grammar) for transpilation into grammar DSLs specific to other tools.

Why?
----

When you create a grammar you want to make it compatible to different parser generators because:

* it allows it to be reused;

* it allows you utilize debugging tools available only to some of them.

And it is possible since most of grammar DSLs implement [EBNF](https://en.wikipedia.org/wiki/EBNF).

How?
----
The general workflow is as follows (but feel free to do as you feel convenient):
* Collect or craft samples of texts in the language you want to parse. They should be convenient for testing. You usually need texts testing each language feature separately, and then interactions between them. You either need a dir of them, if each text occupies multiple lines, or a file of them, if each text occupies a single line.
* Choose a parser generator **CONVENIENT** for you for implementing that grammar. The parser generator must have debugging tools sufficient for your task. It usually should be the most generic class, I mean GLR. You can downgrade the class later. For now your goal is to just develop the grammar, get familiar to it and make it work. I used [`parglare`](https://github.com/igordejanovic/parglare).
* Make sure the needed tools are installed:
* `UniGrammar` itself
* `UniGrammarRuntime`
* parser generator you want to support.
* `git`
* GUI diff and merge tool supporting `git` repos, such as `TortoiseGitMerge`, `WinMerge` (for Windows only) or `meld`.
* Setup your working dir:
* Clone `https://codeberg.org/UniGrammar/grammars` and read its `ReadMe`.
* Find a dir in the repo matching the purpose of the language you want to parse. Create a subdir there for your language. `cd` into it.
* Develop and debug a grammar for the selected parser generator. Make it work. Use debug tools, such as tracers and AST visualizers to make sure it works as intended. Commit it.
* Make an initial port of your grammar to `UniGrammar`:
* Translate it to `grammar.yug`. For now just copy and then manually translate. In future automatic assistance can be developed.
* Use `UniGrammar transpile <yug file> <backend name>` to transpile it into a grammar for the backend of your choice.
* Compare the generated spec to the one you have originally crafted. Make minor insignificant changes to the both specs to make them byte-by-byte identical, keeping the original spec working.
* Set up testing:
* register the tests in your `yug` file
* run `UniGrammar test <yug file> <backend name>` and make sure all the tests pass. These tests mean only that a source is parsed without an issue. If they don't pass, fix the grammar.
* Make compatibility to the rest of backends, downgrading grammar class step-by-step. Modify the `yug` file and test until it works for a backend. Bring compatibility to all the backends.
* You get a universal grammar suitable for more than one backend. Now it's time for deployment and behavioral tests.
* generate a bundle using `UniGrammar gen-bundle <yug file> <backend name>`
* Import runtime `from UniGrammarRuntime.ParserBundle import ParserBundle`
* `b = ParserBundle(Path("path/to/bundle"))`
* `w = b["your_grammar_name"].getWrapper()`
* `parseTree = w("text to parse")`

Guidelines
----------
* An `*.*ug` file is a machine readable and writeable universal grammar file. It is a tree of serialized objects like the ones that can be serialized into JSON. `ug` stands for UniGrammar. It is prepended by a letter:
* `y` stands for YAML
* `j` stands for JSON
* `p` stands for PON - "Python Object Notation" that can be parsed securely using `ast.literal_eval`
* `*b` stands for `binary`. Prepended by a letter identifying a binary format.
* `c` - cbor
* `m` - msgpack
* `o` - own format

* An `*.*ug` file consists of 4 sections, each of them is a `list` of records:
* `characters` for definition of character classes. Needed because of CoCo/R.
* `keywords` - put there whole words that are reserved. Anything that identical to these words will be recognized as these words tokens.
* `tokens` - consist of groups of `characters`. Cannot group other tokens and productions.
* `fragmented` and `productions` - are productions resolved via a state machine. They are mostly the same, but they have big semantic difference, related to wrapper generated from them:
* `fragmented` are considered to be simple text strings. They should never `cap`. It is an artificial class to support scannerful LL parsers. Scannerful LL parsers work from character classes. They split text into tokens and assign a type to each token based on character classes used in it, then do productions, and they never backtrack and the tokenizer doesn't know the context. This means token character classes in general should never overlap, otherwise the tokens may be wrong. So to support "tokens" with overlapping char ranges one splits them into tokens of non-overlapping char ranges, and these "tokens" are not tokens anymore, but productions. But they still have the meaning of tokens. This section is for such "tokens". The postprocessor (can be automatically generated) should join them back into strings. Also their internal structure may be optimized out for the backends where it makes sense.
* `productions` - usual productions, that must always contain at least 1 `cap` (otherwise they belong to `fragmented`; if you get invalid python code, you probably have put something that must be in `fragmented` into `productions`), defining named refs to parse tree children subnodes.

* use `id: <id>` to assign an id to each rule. It must be done for rules in sections.
* use `ref: <assigned id>` to refer an already created rule.
* use `alt: […]` to specify alternatives. **Works for all the sections.** For `chars` allows to enumerate characters.
* use `range: ['<start>', '<stop>']` to create a character range. `[<start>-<stop>]` in regexp syntax.
* use `wellknown: <name>` to specify a group of characters with a well-known name.
* use `neg: true` if the chars are to be excluded.
* use `lit: ...` to add a literal or a single character.
* use `min` to mark iteration. `min: 0` is transpiled to `…*` (`{…}`), `min: 1` is transpiled to `…+` (`… {…}`) in parglare (EBNF) syntaxes.
* use `opt` to mark optionality. It is transpiled to `…?` (`[…]`).
* use `seq: […]` to create a sequence.
* use `cap: <name>` to put the contents of this rule into the parse tree, if it is constructed.
* use `prefer: shift | reduce` to set a preference for `parglare`.
* use `spacer: <n>` to add `n` empty lines.

* use `name` in the root to specify a grammar name.

Here is an example: https://codeberg.org/KOLANICH-libs/AptSourcesList.py/blob/master/grammar.yug

Implemented backends
--------------------
In the order of decreasing performance:
* [parsimonious](https://github.com/erikrose/parsimonious)
* [waxeye](https://github.com/waxeye-org/waxeye) (PEG)
* [ANTLR 4](https://github.com/antlr/antlr4) (LL(*))
* [parglare](https://github.com/igordejanovic/parglare) (LR, GLR)
* [TatSu](https://github.com/neogeny/TatSu)


Not fully implemented backends
-------------------------------
* [CoCo/R](https://github.com/armornick/CocoR) and [CocoPy](https://codeberg.org/UniGrammar/CoCoPy) (LL(1))


Dependencies
------------
* [`rangeslicetools`](https://codeberg.org/KOLANICH-libs/rangeslicetools.py) - for computations with chars ranges
* [`plumbum`](https://github.com/tomerfiliba/plumbum) - for CLI
24 changes: 24 additions & 0 deletions UNLICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org/>
58 changes: 58 additions & 0 deletions UniGrammar/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""UniGrammar is a tool and a lib to deal with parser generators uniformly"""

import typing
import warnings
from copy import deepcopy
from pathlib import Path

from .core.ast import Grammar
from .core.backend.Generator import Generator, TranspiledResult
from .ownGrammarFormat import parseUniGrammarFile


class GrammarTranspilationResults: # pylint: disable=too-few-public-methods
	"""Holds the outcome of transpiling a single unigrammar: the source
	`Grammar` AST together with a mapping from each backend to the
	`TranspiledResult` it produced."""

	__slots__ = ("grammar", "backendResultMapping")

	def __init__(self, grammar: Grammar, backendResultMapping: typing.Dict[typing.Any, TranspiledResult]) -> None:
		"""Record the source grammar and the per-backend transpilation results."""
		self.backendResultMapping = backendResultMapping
		self.grammar = grammar


def transpile(grammar: Grammar, backend: Generator) -> TranspiledResult:
	"""Transpiles a unigrammar into a backend-specific grammar.

	Initializes the backend's transpilation context, lets the backend
	preprocess the grammar AST (this mutates `grammar` — callers pass a
	`deepcopy` when the original must be preserved), then joins the
	emitted lines into a single `TranspiledResult`.

	Note: the return annotation used to say `str`, but the function has
	always returned a `TranspiledResult`; the annotation is fixed here.
	"""
	ctx = backend.initContext(grammar)
	backend.preprocessGrammar(grammar, ctx)
	lines = backend._transpile(grammar, ctx)  # pylint: disable=protected-access

	return TranspiledResult(grammar.meta.id, "\n".join(lines))


def _transpileGrammarForGenerators(gr: Grammar, backends: typing.Iterable[Generator]) -> typing.Iterator[typing.Tuple[Generator, TranspiledResult]]:
	"""Lazily yields `(backend, TranspiledResult)` pairs, one per backend."""
	for gen in backends:
		# transpilation mutates the AST, so each backend works on a pristine deep copy
		yield gen, transpile(deepcopy(gr), gen)


def transpileGrammarForGenerators(gr: Grammar, backends: typing.Iterable[Generator]) -> GrammarTranspilationResults:
	"""Just transpiles a unigrammar for multiple backends."""
	perBackend = dict(_transpileGrammarForGenerators(gr, backends))
	return GrammarTranspilationResults(gr, perBackend)


def transpileFileForGenerators(grammarFile: Path, backends: typing.Iterable[Generator]) -> GrammarTranspilationResults:
	"""Parses a unigrammar file and transpiles it for multiple backends."""
	parsed = parseUniGrammarFile(grammarFile)
	return transpileGrammarForGenerators(parsed, backends)


def transpileFilesForGenerators(files: typing.Iterable[Path], backends: typing.Iterable[Generator]) -> typing.Iterable[typing.Tuple[Path, GrammarTranspilationResults]]:
	"""Lazily transpiles several unigrammar files for multiple backends, yielding `(file, results)` pairs."""
	for grammarFile in files:
		yield grammarFile, transpileFileForGenerators(grammarFile, backends)


def saveTranspiled(transpiledFiles: typing.Dict[Path, GrammarTranspilationResults], outputDir: Path) -> None:
	"""Saves transpiled grammars (returned by `transpileFilesForGenerators`) into files.

	Each grammar is written as `<outputDir>/<grammar id>.<backend main extension>`
	encoded as UTF-8. Existing files are overwritten.
	"""
	for transpiled in transpiledFiles.values():
		for backend, transpiledResult in transpiled.backendResultMapping.items():
			outFile = outputDir / (transpiledResult.id + "." + backend.META.mainExtension)
			outFile.write_text(transpiledResult.text, encoding="utf-8")
Loading

0 comments on commit d48fc29

Please sign in to comment.