Skip to content

Releases: antlr/codebuff

1.5.1

27 Jun 23:01
Compare
Choose a tag to compare

Some re-factoring, updated the readme, set the version number properly to 1.5.1.

1.5

27 Jun 17:36
Compare
Choose a tag to compare
1.5

This release snapshots the repository software, including the corpora and formatted output for all corpora, for an upcoming academic paper.

1.3

13 Apr 00:45
Compare
Choose a tag to compare
1.3

Added SQLite examples from their test suite; fixed a bug where the char position was not set for the first token.

1.2

08 Apr 20:24
Compare
Choose a tag to compare
1.2
  • track repeated child tokens like we do for sibling lists of subtrees.
  • Got the Java stability test going; it now compares misclassifications rather than whitespace edit distance.
  • Better token analysis info tracking too.
  • Some refactoring concerning comments. It was spitting out too many newlines.

Does good job on Java and ANTLR grammars

07 Apr 17:07
Compare
Choose a tag to compare

Did a major reboot of features used for context. Super accurate context info now. Also combined whitespace / newline into single prediction.

seems to do java well. only tweaked model on a single grammar

03 Apr 17:15
Compare
Choose a tag to compare

This release was then tested against a completely unknown language: ANTLR's own grammar language, with 3 grammars as the corpus. The unknown input was a grammar that is not part of that corpus (the Clojure grammar). codebuff starts with all tokens of the unknown input on a single line with no whitespace at all, and this was the result.

It isn't great, but it is a good test of generality.

[edit: ah. It looks as if there were some syntax errors indicating my ANTLR metalanguage grammar is not correct, which could explain some of the less than optimal formatting here.]

grammar Clojure;


file
    :   form*
;

form
    :   literal
    |   list
    |   vector
    |   map
    |   reader_macro
;

forms
    :   form*
;

list
    :   '(' forms ')'
;

vector
    :   '[' forms ']'
;

map
    :   '{' (form form)* '}'
;

set
    :   '#{' forms '}'
;

reader_macro
    :   lambda
    |   meta_data
    |   regex
    |   var_quote
    |   host_expr
    |   set
    |   tag
    |   discard
    |   dispatch
    |   deref
    |   quote
    |   backtick
    |   unquote
    |   unquote_splicing
    |   gensym
;

quote
    :   '\'' form
;

backtick
    :   '`' form
;

unquote
    :   '~' form
;

unquote_splicing
    :   '~@' form
;

tag
    :   '^' form form
;

deref
    :   '@' form
;

gensym
    :   SYMBOL '#'
;

lambda
    :   '#(' form* ')'
;

meta_data
    :   '#^' (map form| form)
    ;

var_quote
    :   '#\'' symbol
;

host_expr
    :   '#+' form form
;

discard
    :   '#_' form
;

dispatch
    :   '#' symbol form
;

regex
    :   '#' string
;

literal
    :   string
    |   number
    |   character
    |   nil
    |   BOOLEAN
    |   keyword
    |   symbol
    |   param_name
;

string
    :   STRING
;

hex
    :   HEX
;

bin
    :   BIN
;

bign
    :   BIGN
;

number
    :   FLOAT
    |   hex
    |   bin
    |   bign
    |   LONG
;

character
    :   named_char
    |   u_hex_quad
    |   any_char
;

named_char
    :   CHAR_NAMED
;

any_char
    :   CHAR_ANY
;

u_hex_quad
    :   CHAR_U
;

nil
    :   NIL
;

keyword
    :   macro_keyword
    |   simple_keyword
;

simple_keyword
    :   ':' symbol
;

macro_keyword
    :   ':' ':' symbol
;

symbol
    :   ns_symbol
    |   simple_sym
;

simple_sym
    :   SYMBOL
;

ns_symbol
    :   NS_SYMBOL
;

param_name
    :   PARAM_NAME
;





STRING         :   '"' (~'"'| '\\' '"')* '"';
FLOAT         : '-'? [0-9]+FLOAT_TAIL
              |   '-'? 'Infinity'
              |   '-'? 'NaN'
;

fragment
FLOAT_TAIL
    :   FLOAT_DECIMAL FLOAT_EXP
    |   FLOAT_DECIMAL
    |   FLOAT_EXP
;

fragment
FLOAT_DECIMAL
    :   '.' [0-9]+
;

fragment
FLOAT_EXP
    :   [eE]'-'? [0-9]+
;

fragment
HEXD
    :   [0-9a-fA-F]
;
HEX         :   '0' [xX] HEXD+
;
BIN         :   '0' [bB] [10]+;
LONG         : '-'? [0-9]+ [lL]?;
BIGN         : '-'? [0-9]+ [nN];
CHAR_U         :   '\\' 'u' [0-9D-Fd-f] HEXD HEXD HEXD;
CHAR_NAMED         :   '\\' ('newline'|'return'|'space'|'tab'|'formfeed'|'backspace');
CHAR_ANY         :   '\\'.;
NIL         : 'nil';
BOOLEAN         :   'true'
                |   'false'
;
SYMBOL         :   '.'
               |   '/'
               |   NAME
;
NS_SYMBOL         :   NAME '/' SYMBOL;
PARAM_NAME         :   '%' ((   ('1' ..'9') ('0' ..'9')*)
|'&')?
;

fragment
NAME
    :   SYMBOL_HEAD SYMBOL_REST* (':'SYMBOL_REST+)*
;

fragment
SYMBOL_HEAD
    :   ~ ('0'
           ..
               '9'
|   '^'
|   '`'
|   '\''
|   '"'
|   '#'
|   '~'
|   '@'
|   ':'
|   '/'
|   '%'
|   '('
|   ')'
|   '['
|   ']'
|   '{'
|   '}'
|[ \n\r\t\,])
;

fragment
SYMBOL_REST
    :   SYMBOL_HEAD
    |   '0'
..   '9'
    |   '.'
;

fragment
WS
    :   [ \n\r\t\,]
;

fragment
COMMENT
    :   ';'~[\r\n]*
;
TRASH         :   (WS| COMMENT) -> channel
(HIDDEN)
;

OK, after a small tweak to lex with the proper lexer ;) here is the actual output of this test:

grammar Clojure;

file:   form*
;

form: literal
    | list
    | vector
    | map
    |   reader_macro
;

forms:   form*
;

list: '(' forms ')' ;

vector: '[' forms ']' ;

map: '{' (form form)* '}' ;

set: '#{' forms '}' ;

reader_macro: lambda
            | meta_data
            | regex
            | var_quote
            | host_expr
            | set
            | tag
            | discard
            | dispatch
            | deref
            | quote
            | backtick
            | unquote
            | unquote_splicing
            |   gensym
;

quote: '\'' form
;

backtick: '`' form
;

unquote: '~' form
;

unquote_splicing: '~@' form
;

tag: '^' form form
;

deref: '@' form
;

gensym:   SYMBOL '#' ;

lambda: '#(' form* ')' ;

meta_data: '#^' (map form| form)
    ;

var_quote: '#\'' symbol
;

host_expr: '#+' form form
;

discard: '#_' form
;

dispatch: '#' symbol form
;

regex: '#' string
;

literal: string
       | number
       | character
       | nil
       |   BOOLEAN
       | keyword
       | symbol
       |   param_name
;

string: STRING;
hex: HEX;
bin: BIN;
bign: BIGN;
number: FLOAT
      | hex
      | bin
      | bign
      |   LONG
;
character: named_char
         | u_hex_quad
         |   any_char
;

named_char: CHAR_NAMED;
any_char: CHAR_ANY;
u_hex_quad: CHAR_U;
nil: NIL;
keyword: macro_keyword
       |   simple_keyword
;

simple_keyword: ':' symbol
;

macro_keyword: ':' ':' symbol
;

symbol: ns_symbol
      |   simple_sym
;

simple_sym: SYMBOL;
ns_symbol: NS_SYMBOL;
param_name: PARAM_NAME;




STRING: '"' ( ~'"' | '\\' '"')* '"';
FLOAT: '-'? [0-9]+ FLOAT_TAIL
     |   '-'? 'Infinity'
     |   '-'? 'NaN'
;

fragment
FLOAT_TAIL
    :   FLOAT_DECIMAL FLOAT_EXP
    |   FLOAT_DECIMAL
    |   FLOAT_EXP
;

fragment
FLOAT_DECIMAL
    : '.' [0-9]+
;

fragment
FLOAT_EXP
    :   [eE] '-'? [0-9]+
;

fragment
HEXD
    :   [0-9a-fA-F]
;
HEX: '0' [xX] HEXD+ ;
BIN: '0' [bB] [10]+ ;
LONG: '-'? [0-9]+ [lL]? ;
BIGN: '-'? [0-9]+[nN];
CHAR_U: '\\' 'u' [0-9D-Fd-f] HEXD HEXD HEXD;

CHAR_NAMED: '\\' ( 'newline'
| 'return'
| 'space'
| 'tab'
| 'formfeed'
| 'backspace') ;
CHAR_ANY: '\\' . ;

NIL: 'nil';
BOOLEAN: 'true'
       |   'false'
;
SYMBOL: '.'
      | '/'
      |   NAME
;

NS_SYMBOL:   NAME '/' SYMBOL;

PARAM_NAME: '%' ((('1'..'9')('0'..'9')*)| '&')?;

fragment
NAME
    :   SYMBOL_HEAD SYMBOL_REST* (':' SYMBOL_REST+)*
;

fragment
SYMBOL_HEAD
    :   ~('0'.. '9'
        | '^' | '`' | '\'' | '"' | '#' | '~' | '@' | ':' | '/' | '%' | '(' | ')' | '[' | ']' | '{' | '}' | [ \n\r\t\,]
)
    ;

fragment
SYMBOL_REST
    :   SYMBOL_HEAD
    | '0'..'9'
    |   '.'
;

fragment
WS
    :   [ \n\r\t\,]
;

fragment
COMMENT
    : ';' ~[\r\n]*
;
TRASH: ( WS
| COMMENT) -> channel(HIDDEN)
;